顿搜
Pandas 数据分析代码示例——Python中数据处理库
pandas 方法
read_csv—读逗号分隔的数据
import pandas
A = pandas.read_csv("test.txt")
print(type(A))
print(A.dtypes)
*<class 'pandas.core.frame.DataFrame'>
No. int64
Content object
Value int64
dtype: object*
to_datetime—时间格式转化
import pandas
A = pandas.read_csv("test1.csv")
A["Date"] = pandas.to_datetime(A["Date"])
print(A.head(3))
*No Content Values Values1 Date
0 1 a 111.0 32.0 2017-01-01
1 2 b NaN 45.0 2017-02-01
2 3 c 333.0 25.0 2017-03-01*
DataFrame
DataFrame数据类型
| 类型 | 意义 | P.S |
|---|---|---|
| Object | String | |
| int | integer | |
| float | float | |
| datetime | time | |
| bool | boolean | |
head—取前几行
A = pandas.read_csv("test.txt")
print(A.head(1))
print("====================================================")
print(A.head())
*No. Content Value
0 1 a 111
——————————————————————————————————————————
No. Content Value
0 1 a 111
1 2 b 222
2 3 c 333
3 4 d 444*
tail—取后几行
A = pandas.read_csv("test.txt")
print(A.tail(1))
print("====================================================")
print(A.tail())
*No. Content Value
3 4 d 444
——————————————————————————————————————————
No. Content Value
0 1 a 111
1 2 b 222
2 3 c 333
3 4 d 444*
columns—获取列名
A = pandas.read_csv("test.txt")
print(A.columns)
Index(['No.', 'Content', 'Value'], dtype='object')
shape—查看行数和列数
A = pandas.read_csv("test.txt")
print(A.shape)
(4, 3)
loc—定位取索引(行号)
A = pandas.read_csv("test.txt")
print(A.loc[3])
print("====================================================")
print(A.loc[[1,3]])
print("====================================================")
print(A.loc[1:3])
*No. 4
Content d
Value 444
Name: 3, dtype: object
——————————————————————————————————————————
No. Content Value
1 2 b 222
3 4 d 444
——————————————————————————————————————————
No. Content Value
1 2 b 222
2 3 c 333
3 4 d 444*
拿到某一列
A = pandas.read_csv("test.txt")
print(A["Content"])
print("====================================================")
print(A[["Content","Value"]])
*0 a
1 b
2 c
3 d
Name: Content, dtype: object
——————————————————————————————————————————
Content Value
0 a 111
1 b 222
2 c 333
3 d 444*
取最大(小)值
A = pandas.read_csv("test.txt")
print(A["Value"].max())
print(A["Value"].min())
444
111
sort_values—按列排序
A = pandas.read_csv("test.txt")
A.sort_values("Value", inplace = True)
print(A)
print("====================================================")
A.sort_values("Value", inplace = True, ascending = False)
print(A)
*No. Content Value
0 1 a 111
1 2 b 222
2 3 c 333
3 4 d 444
——————————————————————————————————————————
No. Content Value
3 4 d 444
2 3 c 333
1 2 b 222
0 1 a 111*
isnull—判断是否有缺失值
A = pandas.read_csv("test1.csv")
print(A)
print("====================================================")
values = A["Value"]
is_null_values = pandas.isnull(values)
print(is_null_values)
true_null = values[is_null_values]
print("====================================================")
print(true_null)
print("====================================================")
print(len(true_null))
*No.\t Content Value
0 1 a 111.0
1 2 b NaN
2 3 c 333.0
3 4 d 444.0
——————————————————————————————————————————
0 False
1 True
2 False
3 False
Name: Value, dtype: bool
——————————————————————————————————————————
1 NaN
Name: Value, dtype: float64
——————————————————————————————————————————
1*
mean—求列的均值
A = pandas.read_csv("test1.csv")
print(A["Values"].mean())
277.25
pivot_table—数据透视表
import numpy
A = pandas.read_csv("test1.csv")
print(A)
print("====================================================")
# Mean of "Values" for each distinct "No". The string alias "mean" is used
# instead of numpy.mean because newer pandas releases emit a FutureWarning
# when a NumPy reduction callable is passed as aggfunc.
statistics = A.pivot_table(index="No", values="Values", aggfunc="mean")
print(statistics)
print("====================================================")
# Sum of both value columns for each distinct "No", again via the string alias.
statistics = A.pivot_table(index="No", values=["Values", "Values1"], aggfunc="sum")
print(statistics)
print(statistics)
*No Content Values Values1
0 1 a 111.0 32.0
1 2 b NaN 45.0
2 3 c 333.0 25.0
3 4 d 444.0 76.0
4 1 e 221.0 NaN
——————————————————————————————————————————
Values
No
1 166.0
2 NaN
3 333.0
4 444.0
——————————————————————————————————————————
Values Values1
No
1 332.0 32.0
2 NaN 45.0
3 333.0 25.0
4 444.0 76.0*
dropna—去掉含缺失值的行(列)
import numpy
A = pandas.read_csv("test1.csv")
drop_na_1 = A.dropna(axis = 1)
print(drop_na_1)
print("====================================================")
drop_na_0 = A.dropna(axis = 0, subset=["Values","Values1"])
print(drop_na_0)
*No Content
0 1 a
1 2 b
2 3 c
3 4 d
4 1 e
——————————————————————————————————————————
No Content Values Values1
0 1 a 111.0 32.0
2 3 c 333.0 25.0
3 4 d 444.0 76.0*
loc—定位
A = pandas.read_csv("test1.csv")
print(A.loc[2,"Values"])
333.0
reset_index—重建索引
A = pandas.read_csv("test1.csv")
A.sort_values("Values", inplace = True, ascending = False)
print(A)
print("====================================================")
A = A.reset_index(drop=True)
print(A)
*No Content Values Values1
3 4 d 444.0 76.0
2 3 c 333.0 25.0
4 1 e 221.0 NaN
0 1 a 111.0 32.0
1 2 b NaN 45.0
——————————————————————————————————————————
No Content Values Values1
0 4 d 444.0 76.0
1 3 c 333.0 25.0
2 1 e 221.0 NaN
3 1 a 111.0 32.0
4 2 b NaN 45.0*
apply—自定义函数
A = pandas.read_csv("test1.csv")
def third(matrix):
    """Return the entry of *matrix* stored under index label 2.

    Intended for ``DataFrame.apply``: with the default axis each column is
    passed in as a Series, so the result is the value of every column at
    label 2 (the third row when the default 0-based index is in use).
    """
    return matrix.loc[2]
print(A.apply(third))
*No 3
Content c
Values 333
Values1 25
dtype: object*
Series
Series 构造
构造单列
A = pandas.read_csv("test1.csv")
print(type(A))
series = A["Values"]
print(type(series))
print(series[0:3])
*<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
0 111.0
1 NaN
2 333.0
Name: Values, dtype: float64*
构造多列并指定索引
from pandas import Series
A = pandas.read_csv("test1.csv")
# Use the "Content" column as the labels and the "Values" column as the data.
content = A["Content"].values
values = A["Values"].values
series = Series(values, index=content)
# reindex() returns NaN for labels that are absent from the index (333 here).
# List-style indexing such as series[["c", 333]] raises KeyError on
# pandas >= 1.0 when any requested label is missing.
result = series.reindex(["c", 333])
print(result)
print("====================================================")
# Explicit positional slice of the first four entries.
result = series.iloc[0:4]
print(result)
print(result)
*c 333.0
333 NaN
dtype: float64
——————————————————————————————————————————
a 111.0
b NaN
c 333.0
d 444.0
dtype: float64*
Series 应用
按列排序
A = pandas.read_csv("test1.csv")
series = A["Content"]
content = series.values
series = A["Values"]
values = series.values
series = Series(values,index = content)
index = series.index.tolist()
sorted_index = sorted(index)
series = series.reindex(sorted_index)
print(series)
print("====================================================")
series = Series(values,index = content)
sorted_by_index = series.sort_index()
print(sorted_by_index)
print("====================================================")
sorted_by_values = series.sort_values()
print(sorted_by_values)
*a 111.0
b NaN
c 333.0
d 444.0
e 221.0
dtype: float64
——————————————————————————————————————————
a 111.0
b NaN
c 333.0
d 444.0
e 221.0
dtype: float64
——————————————————————————————————————————
a 111.0
e 221.0
c 333.0
d 444.0
b NaN
dtype: float64*
series 与 numpy 混用
A = pandas.read_csv("test1.csv")
series = A["Content"]
content = series.values
series = A["Values"]
values = series.values
series = A["Values1"]
values1 = series.values
result1 = Series(values,index = content)
result2 = Series(values1,index = content)
print(numpy.add(result2, result1))
*a 143.0
b NaN
c 358.0
d 520.0
e NaN
dtype: float64*
求两列均值
A = pandas.read_csv("test1.csv")
series = A["Content"]
content = series.values
series = A["Values"]
values = series.values
series = A["Values1"]
values1 = series.values
result1 = Series(values,index = content)
result2 = Series(values1,index = content)
mean = (result1 + result2)/ 2
print(mean)
*a 71.5
b NaN
c 179.0
d 260.0
e NaN
dtype: float64*