Numpy, pandas, scikit

Posted on 2021-02-16 Edited on 2021-02-21 In python 30 days Views: Valine:

Symbols count in article: 17k Reading time ≈ 15 mins.

Matplotlib

Scatter graph is used to represent the relationship between variables
折线图体现变化，散点图体现x和y的关系，条形图统计离散数据，直方图统计连续数据

import random

from matplotlib import pyplot as plt

# One sample per minute over two hours: minute 0..119 counted from 10:00.
x = range(0, 120)
# A random integer temperature between 20 and 35 (inclusive) for each minute.
y = [random.randint(20, 35) for _ in x]

plt.figure(figsize=(20, 8), dpi=80)
plt.plot(x, y)

# Build one "HH:MM" label per minute, using the same format for both hours,
# then show only every third tick so the labels stay readable.
_xtick_labels = [
    "{}:{:02d}".format(hour, minute)
    for hour in (10, 11)
    for minute in range(60)
]
plt.xticks(list(x)[::3], _xtick_labels[::3], rotation=45)

plt.xlabel("Time")
plt.ylabel('Temperature unit(C)')
plt.title('Temperature from 10:12')
plt.show()

x, y and grid

from matplotlib import pyplot as plt

# One y value for every age from 11 through 30.
y = [1, 0, 1, 1, 2, 4, 3, 2, 3, 4, 4, 5, 6, 5, 4, 3, 3, 1, 1, 1]
x = range(11, 31)

plt.figure(figsize=(20, 8), dpi=80)
plt.plot(x, y)

# Label every x position with its age and pin the y axis ticks to 0..8.
age_labels = [f"{age} years old" for age in x]
plt.xticks(x, age_labels, rotation=45)
plt.yticks(range(0, 9))

# Draw a grid; alpha sets how opaque the grid lines are.
plt.grid(alpha=0.4)

plt.show()

Numpy

import random

import numpy as np

# 1-D array built from a Python list.
t1 = np.array([1, 2, 3])
print(t1)

# Any iterable of numbers works, e.g. a range.
t2 = np.array(range(10))
print(t2)

# np.arange(start, stop, step) mirrors range() but returns an ndarray.
t3 = np.arange(4, 10, 2)
print(t3)

# Request the element type explicitly with dtype.
t4 = np.array(range(1, 4), dtype=float)
print(t4)
print(t4.dtype)

# Boolean array: non-zero values become True, zeros become False.
t5 = np.array([1, 1, 0, 1, 0, 1, 1], dtype=bool)
print(t5)

# astype() returns a converted copy with the requested dtype.
t6 = t5.astype(np.int8)
print(t6)
print(t6.dtype)

# Ten random floats produced by random.random().
t7 = np.array([random.random() for _ in range(10)])
print(t7)

# Round every element to two decimal places.
t8 = t7.round(2)
print(t8)

Two dimensional and three dimensional array

# Two-dimensional array: 2 rows, 3 columns.
t2 = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(t2.shape)  # (2, 3)

print('t3 is ')
# Three-dimensional array: 2 blocks, each holding 2 rows of 3 values.
t3 = np.array([
    [[1, 2, 3], [2, 3, 4]],
    [[4, 5, 6], [5, 6, 7]],
])
print(t3.shape)  # (2, 2, 3)

# For a 2-D array, shape is (rows, columns).
t4 = np.arange(12)
print(t4.reshape((1, 12)))  # two-dimensional: 1 row, 12 columns
print(t4.reshape(12))  # one-dimensional: 12 elements

# Use arr.flatten() to turn a multi-dimensional array into a 1-D array.

Reshape

import numpy as np

# Broadcasting:
# - array + scalar applies the scalar to every element, e.g. if t5 is an
#   array, t5 + 2 adds 2 to each member.
# - array (+, -, ...) array needs at least one dimension of the two shapes
#   to correspond (either the rows or the columns).

t1 = np.arange(24)
t2 = t1.reshape(4, 6)  # 4 rows x 6 columns
print(t2)

# Reshaping to a single dimension of length 24 gives a flat array again.
t3 = t1.reshape(24)
print(np.sum(t3))

Three ways to transpose

import numpy as np

t2 = np.arange(24).reshape(4, 6)
print(t2)

# Three equivalent ways to swap the rows and columns of a 2-D array:
print(np.transpose(t2))  # 1. transpose
print(t2.T)  # 2. the .T attribute gives the same result
print(np.swapaxes(t2, 1, 0))  # 3. swap axis 1 with axis 0

Slicing, index, row and column

Slicing: a group of values

Index: a single value

import numpy as np

us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"

# Load both comma-separated files as integer arrays (US first, then UK).
t1, t2 = (
    np.loadtxt(path, delimiter=",", dtype="int")
    for path in (us_file_path, uk_file_path)
)

# print(t1)
print(t2)

print("*" * 100)

# One row
# print(t2[2])

# A contiguous run of rows
# print(t2[2:])

# Several non-adjacent rows
# print(t2[[2, 8, 10]])

# One column
# print(t2[:, 0])

# A contiguous run of columns: everything from column index 2 onward
# (note the comma that separates the row part from the column part)
# print(t2[:, 2:])

# Several non-adjacent columns: column index 0 and column index 2
# print(t2[:, [0, 2]])

# A single value addressed by row and column
# a = t2[2, 3]
# print(a)
# print(type(a))

# Third to fifth row, second to fourth column
# b = t2[2:5, 1:4]
# print(b)

# Several non-adjacent points
# NOTE: the two lists are paired element by element, so this selects the
# points [0, 0] and [2, 1], not a 2x2 block.
# c = t2[[0, 2], [0, 1]]
# c = t2[[0, 2, 2], [0, 1, 3]]
# print(c)

Replace numbers with NaN

import numpy as np

t1 = np.arange(24)
# NaN is a float, so the 4x6 array has to be converted to float before a
# NaN can be stored in it.
t2 = t1.reshape(4, 6).astype(float)

# Set every value smaller than 10 to 3
# t2[t2 < 10] = 3
# print(t2)

# Replace values below 10 with 0 and values above 10 with 10

# 1. two boolean-index assignments
# t2[t2 < 10] = 0
# t2[t2 > 10] = 10
# print(t2)

# 2. np.where, which works like a ternary operator
# t4 = np.where(t2 <= 10, 0, 10)
# print(t4)

# clip: values below 10 become 10, values above 18 become 18
# t4 = t2.clip(10, 18)
# print(t4)

# Put NaN at row index 3, column index 3 (fourth row, fourth value).
t2[3, 3] = np.nan
print(t2)

print("*" * 100)

# Set the whole first column to 0
# t2[:, 0] = 0
# print(t2)

# Count the non-zero entries
# print(np.count_nonzero(t2))

# Boolean mask that is True wherever the value is NaN
# print(np.isnan(t2))

# t3 = np.arange(12).reshape(3, 4)
# print(t3)
# print(np.sum(t3))
#
# # axis=0 adds down the rows (one total per column)
# print(np.sum(t3, axis=0))
#
# # axis=1 adds across the columns (one total per row)
# print(np.sum(t3, axis=1))

# Per-column sum, mean, median and max
# print(t2.sum(axis=0))
# print(t2.mean(axis=0))
# print(np.median(t2, axis=0))
# print(t2.max(axis=0))

# Peak-to-peak range (max - min)
# print(np.ptp(t2))

# Standard deviation over all elements: the larger it is, the less stable
# the data. t2 contains a NaN at this point, so the value printed is nan.
print(np.std(t2))

Use mean values to fill in missing values

import numpy as np


def fill_ndarray(t1):
    """Replace the NaNs in each column of a 2-D array with that column's mean.

    The mean is computed from the column's non-NaN values only. The array is
    modified in place and also returned.

    Args:
        t1: two-dimensional NumPy float array, possibly containing NaN.

    Returns:
        The same array object ``t1`` with its NaNs filled in. A column that
        consists only of NaN has no values to average and is left unchanged.
    """
    for col_idx in range(t1.shape[1]):  # shape[1] is the number of columns
        column = t1[:, col_idx]  # a view, so writes land in t1 itself
        nan_mask = np.isnan(column)
        # Skip columns with nothing to fill, and columns with nothing to
        # average (taking the mean of an empty selection only emits a warning
        # and yields NaN again).
        if not nan_mask.any() or nan_mask.all():
            continue
        column[nan_mask] = column[~nan_mask].mean()
    return t1


if __name__ == '__main__':
    # Demo input: a 4x6 float array whose row at index 1 is NaN from column
    # index 2 onward.
    t1 = np.arange(24, dtype="float").reshape(4, 6)
    t1[1, 2:] = np.nan
    print(t1)

    # Fill the NaNs column by column and show the result.
    t1 = fill_ndarray(t1)
    print(t1)

Exercise: The US YouTube video comments

import numpy as np
from matplotlib import pyplot as plt

us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"

t_us = np.loadtxt(us_file_path, delimiter=",", dtype="int")
t_uk = np.loadtxt(uk_file_path, delimiter=",", dtype="int")

# The comment data is taken from the last column.
all_us_comments = t_us[:, -1]

# Keep only the values that are at most 5000.
t_us_comments = all_us_comments[all_us_comments <= 5000]

# Largest and smallest remaining value.
largest, smallest = t_us_comments.max(), t_us_comments.min()
print(largest, smallest)

d = 50  # width of one histogram bin

# Number of bins = value range divided by the bin width.
bin_nums = (largest - smallest) // d
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(t_us_comments, bin_nums)
plt.show()

Exercise: The relationship between UK video likes, and comments

import numpy as np
from matplotlib import pyplot as plt

us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"

t_us = np.loadtxt(us_file_path, delimiter=",", dtype="int")
t_uk = np.loadtxt(uk_file_path, delimiter=",", dtype="int")

# Keep only the rows whose value in column index 1 is at most 500000.
within_limit = t_uk[:, 1] <= 500000
t_uk = t_uk[within_limit]

# Likes come from column index 1, comments from the last column.
t_uk_like = t_uk[:, 1]
t_uk_comments = t_uk[:, -1]

plt.figure(figsize=(20, 8), dpi=80)
plt.scatter(t_uk_like, t_uk_comments)

plt.show()

Concatenate two data

import numpy as np
from matplotlib import pyplot as plt

# Load the data of both countries.
us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"

us_data = np.loadtxt(us_file_path, delimiter=",", dtype="int")
uk_data = np.loadtxt(uk_file_path, delimiter=",", dtype="int")

# Add the country information as one extra column:
# build a single-column block (the 1 is the number of columns) of zeros
# for the US rows and of ones for the UK rows.
zero_data = np.zeros((us_data.shape[0], 1), dtype=int)
ones_data = np.ones((uk_data.shape[0], 1), dtype=int)

# hstack appends the marker column to the right of each data set.
us_data = np.hstack((us_data, zero_data))
uk_data = np.hstack((uk_data, ones_data))

# vstack stacks the two data sets on top of each other (UK rows first).
final_data = np.vstack((uk_data, us_data))
print(final_data)

Pandas

Basic slicing and key:value pairs

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

# A Series prints with its index in front of each value
# print(pd.Series([1, 2, 3, 4, 12, 3, 4]))

# Custom index labels
# t2 = pd.Series([1, 23, 2, 2, 1], index=list('abcde'))
# print(t2)

# Building a Series from a dict uses the keys as the index.
temp_dict = {
    "Name": "Alex",
    "Age": "30",
    "Tele": 10086,
}
t3 = pd.Series(temp_dict)

# Look a value up by its label
# print(t3["Age"])

# print(t3.index)
# Print every index label
# for i in t3.index:
#     print(i)

# The index can be turned into a list and sliced like one.
print(t3.index.tolist()[:2])

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd


# A DataFrame has both a row index and a column index
# print(pd.DataFrame(np.arange(12).reshape(3, 4),
#                    index=list("abc"), columns=list("WXYZ")))

# t1 = pd.DataFrame(np.arange(12).reshape(3, 4),
#                    index=list("abc"), columns=list("WXYZ"))
#
# print(t1)

# Build a DataFrame from a dict: each key becomes a column and each list
# supplies that column's values.
d1 = {
    "name": ["alex", "lear"],
    "age": [30, 32],
    "tel": [100, 123],
}
t1 = pd.DataFrame(data=d1)
print(t1)

loc function

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df = pd.read_csv("./dogNames2.csv")
# print(df.head())
# print(df.info())

# Sort the DataFrame by count, largest first.
df = df.sort_values("Count_AnimalName", ascending=False)
# print(df[:20])  # the 20 rows with the highest count

# Pick one specific column of those rows
# print(df[:20]["Row_Labels"])

# Square-bracket indexing on a DataFrame:
# 1. a number slice inside the brackets selects rows;
# 2. a string inside the brackets selects a column by its label.
name_column = df["Row_Labels"]
print(type(name_column))  # a single column is a Series

t3 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=list("abc"),
    columns=list("wxyz"),
)
print(t3)

# loc selects by label: loc[row_label, column_label]
print(t3.loc["a", "z"])  # 3, type is int64
print(t3.loc["a"])  # only the row labelled "a"
print(t3.loc[:, "z"])  # only the column labelled "z"

iloc function

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

t3 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=list("abc"),
    columns=list("wxyz"),
)
print(t3)
print("*" * 100)

# iloc selects by position. Column at position 2 (the third column):
print(t3.iloc[:, 2])

# Several non-adjacent columns, in the order given
# print(t3.iloc[:, [2, 1]])

# Rows 0 and 2 combined with columns 2 and 1 (in that order), i.e. the cells
# [0,2],[0,1]
# [2,2],[2,1]
# print(t3.iloc[[0, 2], [2, 1]])

# Same idea with columns 3 and 1: cells [0,3],[0,1] and [2,3],[2,1]
# print(t3.iloc[[0, 2], [3, 1]])

# Every row from position 1 onward, restricted to the first two columns.
print(t3.iloc[1:, :2])

Select Data

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df = pd.read_csv("./dogNames2.csv")

name_counts = df["Count_AnimalName"]
used_more_than_800 = name_counts > 800

# Names that occur more than 800 times and fewer than 1000 times.
print(df[used_more_than_800 & (name_counts < 1000)])

# Names that occur more than 800 times and are longer than 4 characters.
longer_than_4 = df["Row_Labels"].str.len() > 4
print(df[used_more_than_800 & longer_than_4])

drop nan

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

t3 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("wxyz"))
print(t3)
print('*' * 100)

# NaN is a float, so the integer columns that are about to receive it are
# converted to float explicitly; writing NaN straight into an int64 column
# relies on an implicit upcast that pandas has deprecated.
t3 = t3.astype({"w": float, "x": float})

# Set every value from row position 1 onward, in the first two columns, to NaN.
t3.iloc[1:, :2] = np.nan
print(t3)

# True/False frame showing which positions of t3 hold NaN
# print(pd.isnull(t3))

# Keep only the rows whose value in column "w" is not NaN
# print(t3[pd.notnull(t3["w"])])

# how="all": drop only when every value is NaN
# how="any": drop as soon as one value is NaN
# axis=1 drops columns, so this removes every column that contains a NaN.
t3 = t3.dropna(axis=1, how="any")
print(t3)

Exercise: imdb movies, rating & runtime distribution

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')

# Distribution of rating / runtime: continuous data, so use a histogram.

# print(df.head(5))
# Take only the runtime values, without the index.
runtime_data = df["Runtime (Minutes)"].values

min_runtime, max_runtime = runtime_data.min(), runtime_data.max()

# Number of bins = value range divided by a bin width of 5.
bin_width = 5
nums_bin = (max_runtime - min_runtime) // bin_width

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, nums_bin)

# Start the x axis ticks at the smallest runtime, one tick per bin width.
plt.xticks(range(min_runtime, max_runtime + bin_width, bin_width))
plt.show()

Important Exercise: Give a set of movie data, rank their genre

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')

print(df.head(1))

# Split each movie's comma-separated genre string: a list of lists, [[..], [..], ...]
temp_list = df["Genre"].str.split(",").tolist()

# Every distinct genre, flattened out of the nested lists.
genre_list = list({genre for genres in temp_list for genre in genres})

# An all-zero frame with one row per movie and one column per genre, so that
# the genres serve as the column index.
zero_df = pd.DataFrame(
    np.zeros((df.shape[0], len(genre_list))),
    columns=genre_list,
)

# Mark, for every movie, the genres it belongs to.
# `genres` is a list, so one assignment sets several columns at once,
# e.g. zero_df.loc[0, ["Sci-fi", "Thriller"]] = 1
for row, genres in enumerate(temp_list):
    zero_df.loc[row, genres] = 1

print(zero_df.head(3))

# Number of movies per genre (column sums), sorted ascending.
genre_count = zero_df.sum(axis=0).sort_values()
print(genre_count)

plt.figure(figsize=(20, 8), dpi=80)
_x = genre_count.index
_y = genre_count.values
positions = range(len(_x))
plt.bar(positions, _y)
plt.xticks(positions, _x)

plt.show()

Merge, Join默认的合并方式

默认的合并方式inner，交集
merge outer，并集，NaN补全
merge left，左边为准，NaN补全
merge right，右边为准，NaN补全

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df1 = pd.DataFrame(np.ones((2, 4)), index=list("AB"), columns=list("abcd"))

df2 = pd.DataFrame(np.zeros((3, 3)), index=list("ABC"), columns=list("xyz"))

# join keeps the rows of the frame it is called on:
# # rows of df1
# print(df1.join(df2))
#
# # rows of df2
# print(df2.join(df1))

# merge combines the frames on a column.
df3 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list("fax"))

# print(df1.merge(df3, on="a"))
# Make a second row of df3 carry the value 1 in column "a".
df3.loc[1, "a"] = 1
# print(df3)
print(df1)
print(df3)
print(pd.merge(df1, df3, on="a"))

# Output of the three prints above:
#
#      a    b    c    d
# A  1.0  1.0  1.0  1.0
# B  1.0  1.0  1.0  1.0
#    f  a  x
# 0  0  1  2
# 1  3  1  5
# 2  6  7  8
#      a    b    c    d  f  x
# 0  1.0  1.0  1.0  1.0  0  2
# 1  1.0  1.0  1.0  1.0  3  5
# 2  1.0  1.0  1.0  1.0  0  2
# 3  1.0  1.0  1.0  1.0  3  5
#
# Column "a" of df3 holds the value 1 twice, so each row of df1 matches both
# of those rows when merging on "a"; that is why "0 2" and "3 5" each appear
# twice in the result.

Exercise: Starbucks store in CN and US

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd


# Which country has more Starbucks stores, the US or China?
df = pd.read_csv('starbucks_store_worldwide.csv')
print(df.head(1))

# Grouping returns a DataFrameGroupBy object.
grouped = df.groupby(by="Country")

# It can be iterated as (key, sub-frame) pairs, although that is not needed here.

# for i, j in grouped:
#     print(i)
#     print('-' * 100)
#     print(j)
#     print('*' * 100)

# Store count per country, then the figures for the US and for China.
country_count = grouped["Brand"].count()
for country_code in ("US", "CN"):
    print(country_count[country_code])

**Count the number of Starbucks stores in each US state**

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df = pd.read_csv('starbucks_store_worldwide.csv')
print(df.head(1))

# Count the Starbucks stores in each US state.

# Keep only the rows whose country is the US.
us_data = df[df["Country"] == "US"]

# Group the US rows by state and count the "Brand" entries of each group.
grouped = us_data.groupby(by="State/Province")["Brand"].count()
print(grouped)

店铺总数排名前十的国家


# 1. The ten countries with the highest total number of stores
stores_per_country = df.groupby(by="Country")["Brand"].count()
data1 = stores_per_country.sort_values(ascending=False)[:10]

_x = data1.index
_y = data1.values

plt.figure(figsize=(20, 8), dpi=80)
bar_positions = range(len(_x))
plt.bar(bar_positions, _y)

# Label each bar with its country.
plt.xticks(bar_positions, _x)
plt.show()

Rank US cities by their number of Starbucks stores

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df = pd.read_csv('starbucks_store_worldwide.csv')

# Keep only the US rows.
df = df[df["Country"] == "US"]

# Store count per city, largest first, limited to the top 50 cities.
stores_per_city = df.groupby(by="City")["Brand"].count()
data1 = stores_per_city.sort_values(ascending=False)[:50]

_x = data1.index
_y = data1.values

plt.figure(figsize=(20, 8), dpi=80)
bar_positions = range(len(_x))
plt.bar(bar_positions, _y, width=0.3, color="orange")

# Label each bar with its city; rotated so the names do not overlap.
plt.xticks(bar_positions, _x, rotation=45)
plt.show()

不同年份书的数量和不同年份书的平均评分情况

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

df = pd.read_csv('books.csv')

# data1 = df.groupby(by="original_publication_year").count()
# print(data1)

# Number of books per publication year:
# original_publication_year has missing values, so first keep only the rows
# where it is present.
# data1 = df[pd.notnull(df["original_publication_year"])]
# grouped = data1.groupby(by="original_publication_year").count()["title"]
# print(grouped)

# Average rating of the books per publication year.

has_year = df["original_publication_year"].notnull()
data1 = df[has_year]
# Group the rows by publication year, take average_rating of each group and
# compute its mean.
grouped = data1.groupby("original_publication_year")["average_rating"].mean()
print(grouped)

_x = grouped.index
_y = grouped.values

plt.figure(figsize=(20, 8), dpi=80)
positions = range(len(_x))
plt.plot(positions, _y)
# Label only every tenth position, showing the year as an integer.
plt.xticks(list(positions)[::10], _x[::10].astype(int), rotation=45)
plt.show()

Exercise: 不同类型的紧急情况次数

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

'''
Now we have the data of 250,000 911 emergency calls from 2015 to 2017. 
Please count the number of different types of emergency situations in these data. 
If we also want to count the changes in the number of different types of emergency calls 
in different months, What should I do?
'''
df = pd.read_csv("./911.csv")
# print(df.head(5))
# Inspect how the titles split:
# print(df["title"].str.split(": "))  # a Series (one column); indexing it with [0] does not give the category

# Split every title on ": "; each entry becomes a list whose first element is
# the category.
temp_list = df["title"].str.split(": ").tolist()
# Category of every row, kept as an array for the comparisons below.
row_categories = np.array([parts[0] for parts in temp_list])
cate_list = list(set(row_categories.tolist()))  # the distinct categories
# print(cate_list) # ['Fire', 'EMS', 'Traffic']

# Count how often each category occurs.
# 1. Build an all-zero frame: one row per call, one column per category.
zero_df = pd.DataFrame(np.zeros((df.shape[0], len(cate_list))), columns=cate_list)
# print(zero_df)

# 2. Fill it in with one vectorised assignment per category.
for cate in cate_list:
    # A single .loc assignment writes into zero_df itself; the chained form
    # zero_df[cate][mask] = 1 may write into a temporary copy instead.
    # The mask compares the title's prefix, so a category name that merely
    # appears later in a title is not counted.
    zero_df.loc[row_categories == cate, cate] = 1
print(zero_df)


# The very slow alternative: one assignment per row (about 250,000 iterations)
# for i in range(df.shape[0]):
#     # temp_list is a list of lists: take the i-th list, then its element 0
#     zero_df.loc[i, temp_list[i][0]] = 1
# print(zero_df)


# Number of calls per category: axis=0 adds down each column.
sum_ret = zero_df.sum(axis=0)
print(sum_ret)

Exercise: 不同类型的紧急情况次数2

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

'''
Now we have the data of 250,000 911 emergency calls from 2015 to 2017. 
Please count the number of different types of emergency situations in these data. 
If we also want to count the changes in the number of different types of emergency calls 
in different months, What should I do?
'''
df = pd.read_csv("./911.csv")
# print(df.head(5))
# Inspect how the titles split:
# print(df["title"].str.split(": "))  # a Series (one column); indexing it with [0] does not give the category

# Split every title on ": "; each entry becomes a list whose first element is
# the category.
temp_list = df["title"].str.split(": ").tolist()
cate_list = [i[0] for i in temp_list]  # the category of every row, in row order
# Add the categories as a new column. A list of matching length is assigned
# position by position, so no intermediate array or DataFrame is needed.
df["cate"] = cate_list

# With the extra column in place, a groupby on it gives the count per category.
print(df.head(5))
print(df.groupby(by="cate").count()["title"])