100-Days-Of-ML-Code

原项目地址:https://github.com/MLEveryday/100-Days-Of-ML-Code/blob/master/Code/Day%201_Data_Preprocessing.md

(实际上50多天原作者就弃坑了)

-目录-

[TOC]

## Day1:数据预处理

![img](https://github.com/MachineLearning100/100-Days-Of-ML-Code/raw/master/Info-graphs/Day%201.jpg)

1
2
3
4
5
# Import the libraries used by the Day 1 preprocessing script
import numpy as np
import pandas as pd
# The original line read "import sklearn.preprocessing import", which is a syntax error
import sklearn.preprocessing
# Load the dataset from a hard-coded local path into a DataFrame
dataset = pd.read_csv("C:/Users/匿了匿了/Documents/GitHub/100-Days-Of-ML-Code/datasets/Data.csv")

查看dataset

1
2
3
4
5
# Separate the dataset into a feature matrix and a label vector.
# iloc indexing: ":" selects every row/column, [a] selects row/column a,
# and [a, b, c] selects rows/columns a, b and c.
# x: every column except the last one; y: the column at index 3.
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, 3].values
1
2
3
4
5
# Handle missing data
from sklearn.impute import SimpleImputer
# Fit on columns 1-2 so that every NaN there is replaced by the column mean
imputer = SimpleImputer(strategy='mean', missing_values=np.nan).fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])
1
2
3
4
# Encode the categorical data: replace column 0 with integer labels
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])
1
2
3
4
5
6
7
# Create dummy variables: one-hot encode the categorical column 0
from sklearn.compose import ColumnTransformer
onehotencoder = OneHotEncoder(categories='auto')
# Column 0 goes through the encoder; all other columns are passed through unchanged
t = ColumnTransformer([('encoder', onehotencoder, [0])], remainder='passthrough')
# The original stored the result only in `temp`, so the later train/test split and
# scaling never saw the one-hot encoded features. `temp` is kept for backward
# compatibility and the result is written back to x.
temp = np.array(t.fit_transform(x))
x = temp
# Encode the target labels as integers
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
1
2
3
# Split the dataset into a training set and a test set (20% held out, fixed seed)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
1
2
3
4
5
# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
# The scaler is fitted on the training set only; the test set reuses that fit.
# The tuple is evaluated left to right, so fit_transform runs before transform.
x_train, x_test = sc_x.fit_transform(x_train), sc_x.transform(x_test)

## Day2:简单线性回归模型

1
2
3
4
5
6
7
8
9
10
11
12
# Step 1: data preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Q1: "Backend TkAgg is interactive backend. Turning interactive mode on."
# Author's note: the session runs interactively in a console, so figures
# cannot be displayed in the graphical window.
plt.ion()
dataset = pd.read_csv("C:/Users/匿了匿了/Documents/GitHub/100-Days-Of-ML-Code/datasets/studentscores.csv")
# x: the first column kept as a 2-D array; y: the column at index 1
x, y = dataset.iloc[:, :1].values, dataset.iloc[:, 1].values
# Hold out a quarter of the rows for testing, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)
image-20200403195132403
1
2
3
4
# Step 2: train a simple linear regression model on the training set
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting can be chained
regressor = LinearRegression().fit(x_train, y_train)

1
2
# Step 3: predict the results for the test set
y_pred = regressor.predict(
    x_test
)

![image-20200403195322599](C:\Users\yuehan lian\AppData\Roaming\Typora\typora-user-images\image-20200403195322599.png)

1
2
3
4
# Step 4: visualization
# Training set: observed points in red, fitted line in blue
plt.scatter(x_train, y_train, color='red')
plt.plot(
    x_train,
    regressor.predict(x_train),
    color='blue',
)
1
2
3
# Test set: observed points in pink, model predictions as a green line
plt.scatter(x_test, y_test, color='pink')
plt.plot(
    x_test,
    regressor.predict(x_test),
    color='green',
)

## Day3:多元线性回归

1
2
3
4
5
6
7
8
# Step 1: data preprocessing
# Import libraries
import pandas as pd
import numpy as np
# Load the dataset from a hard-coded local path
dataset = pd.read_csv('C:/Users/匿了匿了/Documents/GitHub/100-Days-Of-ML-Code/datasets/50_Startups.csv')
# x: every column except the last one; y: the column at index 4
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, 4].values
1
2
3
4
5
6
7
8
# Convert the categorical column (index 3) to numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# First replace the category values with integer labels
labelencoder = LabelEncoder()
x[:, 3] = labelencoder.fit_transform(x[:, 3])
onehotencoder = OneHotEncoder(categories='auto')
# Then one-hot encode column 3 and pass the remaining columns through unchanged
t = ColumnTransformer(
    [('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
x = np.array(t.fit_transform(x))
1
2
# Avoid the dummy variable trap: drop the first column of x
x = x[:, 1:]
1
2
# Split the dataset into a training set and a test set (20% held out, fixed seed)
# Imported here so the Day 3 script runs on its own; the only other import of
# train_test_split in these notes belongs to an earlier day's script.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
1
2
3
4
5
# Step 2: train a multiple linear regression model on the training set
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting can be chained
regressor = LinearRegression().fit(x_train, y_train)
# Predict on the test set with the fitted model
y_pred = regressor.predict(x_test)
1
2
# Step 3: predict the results on the test set
y_pred = regressor.predict(
    x_test
)