Logistic Regression: A Worked Example in Python
The previous post covered the theory behind logistic regression; in this post we build a classification example with LogisticRegression from the Python machine-learning package sklearn. The data are again student samples with just two features, the scores on two courses (score1 and score2); the class label y indicates whether the student is admitted. Let's look at the classification result first:

[Figure: scatter plot of the two classes with the fitted decision boundaries]

The complete Python code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Wed Nov 08 17:49:41 2017

@author: zhenlin
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

# 1. Construct the data
sample_number = 200
# Parameters of the first Gaussian
mean1 = [0, 4]            # means of the two dimensions
cov1 = [[5, 3], [3, 10]]  # covariance matrix; must be symmetric positive semi-definite
# Parameters of the second Gaussian
mean2 = [7, 5]
cov2 = [[7, 2], [2, 15]]
# Sample points from the two bivariate Gaussians (.T transposes)
class1_x1, class1_x2 = np.random.multivariate_normal(mean1, cov1, sample_number).T
class2_x1, class2_x2 = np.random.multivariate_normal(mean2, cov2, sample_number).T
# The two Gaussians correspond to the two class labels
data = [[class1_x1[i], class1_x2[i], 0] for i in range(sample_number)] + [[class2_x1[i], class2_x2[i], 1] for i in range(sample_number)]
# Load into a pandas DataFrame
data = pd.DataFrame(data, columns=['score1', 'score2', 'result'])
score_data = data[['score1', 'score2']]
result_data = data['result']

# 2. Train the model
average_precision = 0  # average accuracy over the validation rounds
iters = 10             # number of validation rounds
for i in range(iters):
    # Split the data: 80% for training, 20% for testing
    x_train, x_test, y_train, y_test = train_test_split(score_data, result_data, test_size=0.2)
    # Build a logistic regression model with default settings
    model = LogisticRegression()
    # Train
    model.fit(x_train, y_train)
    # Predict
    predict_y = model.predict(x_test)
    # Accumulate the accuracy on the test set
    average_precision += np.mean(predict_y == y_test)
average_precision /= iters

# 3. Plot the decision boundary - method 1
x1_min, x1_max = score_data['score1'].min() - .5, score_data['score1'].max() + .5

def generate_face(prob):
    # Invert the sigmoid: points with P(y=1) = prob lie on the line w1*x1 + w2*x2 + b = y
    y = -np.log(1.0 / prob - 1.0)
    n = 500
    x1 = np.linspace(x1_min, x1_max, n)
    x2 = (-model.coef_[0][0] / float(model.coef_[0][1])) * x1 + (y - model.intercept_) / float(model.coef_[0][1])
    return x1, x2

pos_data = data[data['result'] == 1]
neg_data = data[data['result'] == 0]
plt.scatter(x=pos_data['score1'], y=pos_data['score2'], color='black', marker='o')
plt.scatter(x=neg_data['score1'], y=neg_data['score2'], color='red', marker='*')
face_04_x1, face_04_x2 = generate_face(0.4)
face_05_x1, face_05_x2 = generate_face(0.5)
face_06_x1, face_06_x2 = generate_face(0.6)
plt.plot(face_04_x1, face_04_x2)
plt.plot(face_05_x1, face_05_x2)
plt.plot(face_06_x1, face_06_x2)
plt.xlim(score_data['score1'].min(), score_data['score1'].max())
plt.ylim(score_data['score2'].min(), score_data['score2'].max())
plt.xlabel('score1')
plt.ylabel('score2')
plt.legend(['prob_threshold = 0.4', 'prob_threshold = 0.5', 'prob_threshold = 0.6'],
           loc='center left', bbox_to_anchor=(1, 0.865))
plt.show()
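A quick note on where the line drawn by generate_face comes from. Logistic regression models P(y=1) as the sigmoid of a linear score, so the set of points predicted with probability exactly $p$ is a straight line:

\[
p = \frac{1}{1 + e^{-(w_1 x_1 + w_2 x_2 + b)}}
\quad\Longleftrightarrow\quad
w_1 x_1 + w_2 x_2 + b = -\ln\!\left(\frac{1}{p} - 1\right)
\]

Writing $y$ for the right-hand side, as the code does, and solving for $x_2$ gives the line being plotted:

\[
x_2 = -\frac{w_1}{w_2}\, x_1 + \frac{y - b}{w_2},
\]

with $w_1, w_2$ read from model.coef_ and $b$ from model.intercept_. For $p = 0.5$ we get $y = 0$, the usual decision boundary; $p = 0.4$ and $p = 0.6$ shift the line parallel to it.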
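As an aside, the hand-rolled split/train/score loop in step 2 can also be written with sklearn's own helpers. This is a minimal sketch, not part of the original code, assuming a scikit-learn version that has the model_selection module; ShuffleSplit reproduces the repeated random 80/20 splits used above:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score

# 10 random 80/20 splits; score each model by accuracy on its held-out 20%
cv = ShuffleSplit(n_splits=10, test_size=0.2)
scores = cross_val_score(LogisticRegression(), score_data, result_data, cv=cv, scoring='accuracy')
print('average accuracy: %f' % scores.mean())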
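Method 2 below draws the boundary differently: it evaluates the trained model on a dense grid of points and colors each grid cell by the predicted class, so the boundary appears wherever the color changes. The core trick is the meshgrid/ravel/np.c_ combination; here is a tiny standalone example of what it produces:

import numpy as np

s1, s2 = np.meshgrid(np.arange(0, 3), np.arange(0, 2))
# s1 = [[0, 1, 2],     s2 = [[0, 0, 0],
#       [0, 1, 2]]           [1, 1, 1]]
points = np.c_[s1.ravel(), s2.ravel()]
# Each grid coordinate becomes one row:
# [[0, 0], [1, 0], [2, 0], [0, 1], [1, 1], [2, 1]]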
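Steps 5 and 6 below evaluate the classifier with a precision-recall curve. If you also want a single-number summary of that curve, sklearn's average_precision_score computes a step-wise approximation of the area under it; a minimal sketch, not in the original code, reusing the labels y_test and the predicted probabilities answer computed in step 5:

from sklearn.metrics import average_precision_score

# One-number summary of how well the probabilities rank positives above negatives
ap = average_precision_score(y_test, answer)
print('area under the PR curve: %f' % ap)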
# 4. Plot the decision boundary - method 2
pos_data = data[data['result'] == 1]
neg_data = data[data['result'] == 0]
h = 0.02
s1_min, s1_max = score_data['score1'].min() - .5, score_data['score1'].max() + .5
s2_min, s2_max = score_data['score2'].min() - .5, score_data['score2'].max() + .5
# Generate grid points with s1 in [s1_min, s1_max] and s2 in [s2_min, s2_max]
# For what meshgrid does, see: http://blog.sciencenet.cn/blog-791749-675394.html
s1, s2 = np.meshgrid(np.arange(s1_min, s1_max, h), np.arange(s2_min, s2_max, h))
# Stack the two coordinates column-wise to form 2-D data points
Z = model.predict(np.c_[s1.ravel(), s2.ravel()])
# Plot the boundary and the scatter points
Z = Z.reshape(s1.shape)
# The point (s1[i], s2[i]) gets color Z[i]; the colormap is plt.cm.Paired
plt.pcolormesh(s1, s2, Z, cmap=plt.cm.Paired, shading='auto')
plt.scatter(x=pos_data['score1'], y=pos_data['score2'], color='black', marker='o')
plt.scatter(x=neg_data['score1'], y=neg_data['score2'], color='red', marker='*')
plt.xlim(s1.min(), s1.max())
plt.ylim(s2.min(), s2.max())
plt.xlabel('score1')
plt.ylabel('score2')
plt.show()

# 5. Evaluate the model
# Predicted probability of class 1 for the test data
answer = model.predict_proba(x_test)[:, 1]
# Compute precision and recall at different probability thresholds
precision, recall, thresholds = precision_recall_curve(y_test, answer)
# Report 1 when prob > 0.5
report = answer > 0.5
print(classification_report(y_test, report, target_names=['neg', 'pos']))
print('average accuracy: %f' % average_precision)

# 6. Plot the precision-recall curve
# Step plot: jumps at each point (recall[i], precision[i])
plt.step(recall, precision, color='b', alpha=0.2, where='post')
# Fill the area under the PR curve
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve')
plt.show()

Below, I'll walk through the code module by module; experienced readers can skip ahead. ...