What is machine learning?
Definition
In short, machine learning is a branch of artificial intelligence that gives computers the ability to learn without being explicitly programmed.
Categories
- Supervised learning: each training sample carries a corresponding "label", e.g. recognition problems
  - Classification: sample labels belong to two or more classes (discrete)
  - Regression: sample labels consist of one or more continuous variables (continuous)
- Unsupervised learning: training samples carry no corresponding "labels", e.g. clustering problems; see the sketch below
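The contrast can be made concrete with a minimal sketch (my illustration, not from the original notes, using scikit-learn's bundled iris data):

# Supervised vs. unsupervised learning on the same data
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

X, y = load_iris(return_X_y=True)

# Supervised: the true labels y are handed to the learner
clf = KNeighborsClassifier().fit(X, y)
print(clf.predict(X[:3]))   # predicted class labels

# Unsupervised: only X is given; the learner must find structure on its own
km = KMeans(n_clusters=3, n_init=10).fit(X)
print(km.labels_[:3])       # cluster assignments, not class labels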
Basic workflow
How to obtain the learned model
Key concepts
- A "learning" problem usually involves n data samples (training samples) and then tries to predict attributes of unseen data (test samples)
- The attributes that make up each sample (the dimensions of the data) are called "features"
- Training, validation, and test samples (see the split sketch below)
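A minimal sketch of producing the three subsets with two calls to train_test_split (an illustration; the proportions are arbitrary):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
# First split off a test set, then carve a validation set out of the remainder
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.25, random_state=0)
print(len(X_train), len(X_val), len(X_test))  # 90, 30, 30 for the 150 iris samples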
Getting to know machine learning through scikit-learn
What is scikit-learn?
- A free machine learning library for Python
- Provides classification, regression, and clustering algorithms such as SVM, random forests, and k-means
- Also includes dimensionality reduction, model selection, and preprocessing
- Built on NumPy and SciPy data structures
- Installation
Requirements: Python (>= 2.6 or >= 3.3), NumPy (>= 1.6.1), SciPy (>= 0.9).
pip install scikit-learn
conda install scikit-learn
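A quick way to verify the installation (a small addition, not from the original notes):

import sklearn
import numpy
import scipy
print('scikit-learn:', sklearn.__version__)
print('NumPy:', numpy.__version__)
print('SciPy:', scipy.__version__)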
How to use scikit-learn
Small examples
digits: handwritten digit recognition
# coding: utf-8
# Load an example dataset
from sklearn import datasets
digits = datasets.load_digits()
# Inspect the dataset
print(digits.data)
print(digits.data.shape)
print(digits.target_names)
print(digits.target)
print(digits.images[-2])
# Plot with matplotlib
import matplotlib.pyplot as plt
plt.figure(figsize=(3, 3))
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()
# Train a model on the training set
from sklearn import svm
# Split training and test sets by hand
n_test = 1400  # number of test samples
train_digits_X = digits.data[:-n_test, :]
train_digits_Y = digits.target[:-n_test]
test_digits_X = digits.data[-n_test:, :]
test_digits_Y = digits.target[-n_test:]
# Choose an SVM model for digits
svm_digits = svm.SVC(gamma=0.001, C=100.)
# Train the model
svm_digits.fit(train_digits_X, train_digits_Y)
# Evaluate the model on the test set
digits_pred_Y = svm_digits.predict(test_digits_X)
# Inspect the results
from sklearn.metrics import accuracy_score
print('Predicted labels:', digits_pred_Y)
print('True labels:', test_digits_Y)
print('svm_digits accuracy:', accuracy_score(test_digits_Y, digits_pred_Y))
# Save the model
# On Python 2, use: import cPickle as pickle
import pickle
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_digits, f)
import numpy as np
# Reload the model and predict
with open('svm_model.pkl', 'rb') as f:
    model = pickle.load(f)
random_samples_index = np.random.randint(0, len(digits.data), 100)  # digits has 1797 samples
random_samples = digits.data[random_samples_index, :]
random_targets = digits.target[random_samples_index]
print(random_samples)
random_predict = model.predict(random_samples)
print(random_predict)
print(random_targets)
print('svm_digits accuracy:', accuracy_score(random_targets, random_predict))
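As a side note not in the original: for fitted scikit-learn estimators, the project's documentation also suggests joblib, which handles the large NumPy arrays inside models more efficiently than plain pickle. A minimal sketch:

from joblib import dump, load
dump(svm_digits, 'svm_model.joblib')   # save the fitted model
model = load('svm_model.joblib')       # reload it later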
iris: the iris flower dataset
# coding: utf-8
# Cross-validation
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
# When running in a Jupyter notebook: %matplotlib inline
# Inspect the dataset
# iris
iris = datasets.load_iris()
print(iris.data)
print(iris.data.shape)
print(iris.target_names)
print(iris.target)
X = iris.data
y = iris.target
# Split training and test sets
# random_state makes the random split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=5)
k_range = range(1, 30)
cv_scores = []
for n in k_range:
    knn = KNeighborsClassifier(n)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')  # for classification
    # scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='neg_mean_squared_error')  # for regression
    cv_scores.append(scores.mean())
plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
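# (Addition, not in the original) the best K can also be picked programmatically
# from the cross-validation scores rather than read off the plot:
import numpy as np
print('Best K by CV:', k_range[int(np.argmax(cv_scores))])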
# Choose the optimal K (here K=1, read off the accuracy curve above)
best_knn = KNeighborsClassifier(1)
best_knn.fit(X_train, y_train)
print(best_knn.score(X_test, y_test))
# Feature normalization
import numpy as np
from sklearn import preprocessing
x1 = np.random.randint(0, 10, 5).reshape(5, 1)
x2 = np.random.randint(0, 1000, 5).reshape(5, 1)
x3 = np.random.randint(0, 100000, 5).reshape(5, 1)
X = np.concatenate([x1, x2, x3], axis=1)
print(X)
print(preprocessing.scale(X))
# Generate classification data to show why scaling matters
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# When running in a Jupyter notebook: %matplotlib inline
# n_samples: number of samples
# n_features: total number of features
# n_redundant: number of redundant features
# X: the data
# y: the labels
X, y = make_classification(n_samples=1000, n_features=30, n_redundant=10,
                           random_state=25, n_clusters_per_class=1, scale=100)
from sklearn import svm
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=7)
svm_classifier = svm.SVC()
# Train the model
svm_classifier.fit(X_train, y_train)
# Score without scaling
print(svm_classifier.score(X_test, y_test))
# Now scale the features and train again
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=7)
svm_classifier = svm.SVC()
svm_classifier.fit(X_train, y_train)
print(svm_classifier.score(X_test, y_test))
# Training a regression model
from sklearn import datasets
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this example assumes an older version
boston_data = datasets.load_boston()
X = boston_data.data
y = boston_data.target
print('Samples:')
print(X[:5, :])
print('Labels:')
print(y[:5])
# Choose a linear regression model
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
from sklearn.model_selection import train_test_split
# Split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=7)
# Train the model
lr_model.fit(X_train, y_train)
# get_params() returns the model's hyperparameters
print(lr_model.get_params())
# score() returns the coefficient of determination R^2 for regressors
print(lr_model.score(X_train, y_train))
print(lr_model.score(X_test, y_test))
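The score() calls above report R². As a small addition (not in the original notes), the fit can also be checked with explicit predictions and the mean squared error from sklearn.metrics:

from sklearn.metrics import mean_squared_error
y_pred = lr_model.predict(X_test)
print('Test MSE:', mean_squared_error(y_test, y_pred))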
Extensions
Cross-validation
In k-fold cross-validation the training data is split into k folds; the model is trained on k-1 folds and validated on the remaining fold, rotating through all k folds, and the k scores are averaged. This is what cross_val_score(..., cv=10) does in the examples above.
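A minimal sketch of 10-fold cross-validation, reusing the iris data from the examples above:

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
scores = cross_val_score(KNeighborsClassifier(5), X, y, cv=10, scoring='accuracy')
print(scores.mean(), scores.std())  # average accuracy and spread over the 10 folds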
Feature normalization
When a model optimizes its parameters with gradient descent, the step behavior depends on the scale of the data. If multiple features span very different ranges, the descent path can oscillate back and forth and converge slowly.
Feature normalization addresses this: in short, it brings all features into roughly the same range. Two common variants follow (a NumPy sketch comes after the formulas).
- #### Feature scaling
Keep all features within the same range: divide each feature's values by its range (maximum minus minimum).
$$
x_i = \frac{x_i}{\max\{x_i\} - \min\{x_i\}}
$$
- #### Mean normalization
Center every feature at 0: subtract each feature's mean from its values.
$$
x_i = x_i - \operatorname{avg}(x_i)
$$
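A minimal NumPy sketch of both operations (my illustration, applied column-wise to a small feature matrix):

import numpy as np

X = np.array([[1., 200., 3000.],
              [2., 400., 9000.],
              [3., 800., 6000.]])

# Feature scaling: divide each feature (column) by its range (max - min)
X_scaled = X / (X.max(axis=0) - X.min(axis=0))
print(X_scaled)

# Mean normalization: subtract each feature's mean so columns center at 0
X_centered = X - X.mean(axis=0)
print(X_centered)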
Large case study: gender recognition from voice data
# -*- coding: utf-8 -*-
import pandas as pd
from pandas_tools import inspect_dataset
from pandas_tools import process_missing_data
from pandas_tools import visualize_two_features, visualize_single_feature, \
    visualize_multiple_features
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np


def run_main():
    """
    Main function
    """
    # Step 0: load the data
    filepath = './dataset/voice.csv'
    voice_data = pd.read_csv(filepath)

    # Step 1: inspect the data
    inspect_dataset(voice_data)
    # Count samples per label
    print(voice_data['label'].value_counts())

    # Step 2: handle missing data
    voice_data = process_missing_data(voice_data)

    # Step 3: visualize feature distributions
    fea_name1 = 'meanfun'   # mean fundamental frequency of the voice signal
    fea_name2 = 'centroid'  # frequency centroid
    # A single feature
    visualize_single_feature(voice_data, fea_name1)
    # Two features
    visualize_two_features(voice_data, fea_name1, fea_name2)
    # Multiple features
    fea_names = ['meanfreq', 'Q25', 'Q75', fea_name1, fea_name2, 'label']
    visualize_multiple_features(voice_data, fea_names)

    # Step 4: prepare the data
    X = voice_data.iloc[:, :-1].values
    # Map string labels to integers
    voice_data['label'] = voice_data['label'].replace({'male': 0, 'female': 1})
    y = voice_data['label'].values

    # Feature normalization
    X = preprocessing.scale(X)

    # Split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=5)

    # Model selection via cross-validation
    k_range = range(1, 31)
    cv_scores = []
    print('Cross-validation:')
    for k in k_range:
        knn = KNeighborsClassifier(k)
        scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        print('%i: %.4f' % (k, score_mean))
    best_k = np.argmax(cv_scores) + 1
    print('Best K:', best_k)
    plt.plot(k_range, cv_scores)
    plt.xlabel('K')
    plt.ylabel('Accuracy')
    plt.show()

    # Train the model
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('Test accuracy:', knn_model.score(X_test, y_test))


if __name__ == '__main__':
    run_main()
The pandas_tools helper module used above:
# -*- coding: utf-8 -*-
import seaborn as sns
import matplotlib.pyplot as plt


def inspect_dataset(df_data):
    """
    Print basic information about the loaded dataset
    """
    print('Basic dataset information:')
    df_data.info()  # info() prints directly
    print('The dataset has %i rows and %i columns' % (df_data.shape[0], df_data.shape[1]))
    print('Data preview:')
    print(df_data.head())


def process_missing_data(df_data):
    """
    Handle missing data
    """
    if df_data.isnull().values.any():
        # Missing data present
        print('Missing data found!')
        df_data = df_data.fillna(0.)   # fill NaNs
        # df_data = df_data.dropna()   # or drop NaNs
    return df_data.reset_index(drop=True)  # drop=True avoids adding the old index as a column


def visualize_two_features(df_data, col_label1, col_label2):
    """
    Visualize the joint distribution of two features
    """
    # Note: in newer seaborn versions the 'size' parameter was renamed 'height'
    g = sns.FacetGrid(df_data, hue="label", height=8)
    g = g.map(plt.scatter, col_label1, col_label2)
    g.add_legend()
    plt.show()


def visualize_single_feature(df_data, col_label):
    """
    Visualize a single feature
    """
    sns.boxplot(x="label", y=col_label, data=df_data)
    g2 = sns.FacetGrid(df_data, hue="label", height=6)
    g2.map(sns.kdeplot, col_label)
    # g2.add_legend()
    plt.show()


def visualize_multiple_features(voice_data, fea_names):
    """
    Visualize multiple features pairwise
    """
    sns.pairplot(voice_data[fea_names], hue='label', height=2)
    plt.show()
Links