Python

上一篇博客主要介绍了逻辑回归的理论知识，这篇博客咱们用Python机器学习包sklearn中的LogisticRegression做一个分类的实例。数据还是学生样本，只有两个特征，分别是两门课的分数score1和score2，类标号y表示这个学生是否能被录取。先上分类效果图：完整的Python代码如下： 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 # -*- coding: utf-8 -*- """ Created on Wed Nov 08 17:49:41 2017 @author: zhenlin """ import numpy as np import pandas as pd from sklearn.cross_validation import train_test_split from sklearn.linear_model import LogisticRegression import matplotlib.pyplot as plt from sklearn.metrics import precision_recall_curve from sklearn.metrics import classification_report # 1. 构造数据 sample_number = 200 # 第一个高斯分布参数 mean1 = [0, 4] # 两个维度上的均值 cov1 = [[5, 3], [3, 10]] # 两个维度的协方差矩阵，必须满足对称半正定 # 第二个高斯分布参数 mean2 = [7, 5] cov2 = [[7, 2], [2, 15]] # 从两个二元高斯分布中随机采样数据点 class1_x1, class1_x2 = np.random.multivariate_normal(mean1, cov1, sample_number).T # .T表示转置 class2_x1, class2_x2 = np.random.multivariate_normal(mean2, cov2, sample_number).T # 两个高斯分布对应两个类标号 data = [[class1_x1[i],class1_x2[i],0] for i in range(sample_number)]+[[class2_x1[i],class2_x2[i],1] for i in range(sample_number)] # 填充到pandas中 data = pd.DataFrame(data,columns=['score1','score2','result']) score_data = data[['score1','score2']] result_data = data['result'] # 2. 训练模型 average_precision = 0 # 平均准确度 iters = 10 # 交叉验证次数 for i in xrange(iters): # 数据划分，80%用于训练，20%用于预测 x_train, x_test, y_train, y_test = train_test_split(score_data, result_data, test_size = 0.2) # 构造默认逻辑回归模型 model = LogisticRegression() # 训练 model.fit(x_train, y_train) # 预测 predict_y = model.predict(x_test) # 计算测试集上的准确度 average_precision += np.mean(predict_y == y_test) average_precision /= iters # 3. 绘制分类面 - 法1 x1_min, x1_max = score_data['score1'].min() - .5, score_data['score1'].max() + .5 def generate_face(prob): y = -np.log(1.0 / prob - 1.0) n = 500 x1 = np.linspace(x1_min, x1_max, n) # w1x1+w2x2+b=y x2 = (-model.coef_[0][0] / float(model.coef_[0][1])) * x1 + (y - model.intercept_) / float(model.coef_[0][1]) return x1, x2 pos_data = data[data['result'] == 1] neg_data = data[data['result'] == 0] plt.scatter(x = pos_data['score1'], y = pos_data['score2'], color = 'black', marker = 'o') plt.scatter(x = neg_data['score1'], y = neg_data['score2'], color = 'red', marker = '*') face_04_x1, face_04_x2 = generate_face(0.4) face_05_x1, face_05_x2 = generate_face(0.5) face_06_x1, face_06_x2 = generate_face(0.6) plt.plot(face_04_x1, face_04_x2) plt.plot(face_05_x1, face_05_x2) plt.plot(face_06_x1, face_06_x2) plt.xlim(score_data['score1'].min(), score_data['score1'].max()) plt.ylim(score_data['score2'].min(), score_data['score2'].max()) plt.xlabel('score1') plt.ylabel('score2') plt.legend(['prob_threshold = 0.4', 'prob_threshold = 0.5', 'prob_threshold = 0.6'], loc='center left', bbox_to_anchor=(1, 0.865)) plt.show() # 4. 绘制分类面 - 法2 pos_data = data[data['result'] == 1] neg_data = data[data['result'] == 0] h = 0.02 s1_min, s1_max = score_data['score1'].min() - .5, score_data['score1'].max() + .5 s2_min, s2_max = score_data['score2'].min() - .5, score_data['score2'].max() + .5 # 生成s1在[s1_min, s1_max]，且s2在[s2_min, s2_max]的网格数据点 # meshgrid含义参见：http://blog.sciencenet.cn/blog-791749-675394.html s1, s2 = np.meshgrid(np.arange(s1_min, s1_max, h), np.arange(s2_min, s2_max, h)) # 把两个坐标的值按列拼在一起构成二维数据点 Z = model.predict(np.c_[s1.ravel(), s2.ravel()]) # 绘制边界和散点 Z = Z.reshape(s1.shape) # 坐标点是(s1[i], s2[i])，对应颜色是Z[i]，颜色主题使用plt.cm.Paired plt.pcolormesh(s1, s2, Z, cmap = plt.cm.Paired) plt.scatter(x = pos_data['score1'], y = pos_data['score2'], color = 'black', marker = 'o') plt.scatter(x = neg_data['score1'], y = neg_data['score2'], color = 'red', marker = '*') plt.xlim(s1.min(), s1.max()) plt.ylim(s2.min(), s2.max()) plt.xlabel('score1') plt.ylabel('score2') plt.show() # 5. 评估模型 # 对于测试数据，模型输出1的概率 answer = model.predict_proba(x_test)[:,1] # 计算不同概率阈值下的P和R precision, recall, thresholds = precision_recall_curve(y_test, answer) # prob > 0.5的报告为1 report = answer > 0.5 print(classification_report(y_test, report, target_names = ['neg', 'pos'])) print('average precision: %f'%average_precision) # 6. 绘制PRC曲线 # step阶跃图，在点(recall[i],precision[i])进行跳变 plt.step(recall, precision, color='b', alpha=0.2, where='post') # 对PRC下方填充颜色 plt.fill_between(recall, precision, step='post', alpha=0.2, color='b') plt.xlabel('Recall') plt.ylabel('Precision') plt.ylim([0.0, 1.05]) plt.xlim([0.0, 1.0]) plt.title('2-class Precision-Recall curve') plt.show() 下面将逐模块介绍代码细节，大神可以略过。 ...

去年暑假在北大计算所实习的时候，任务之一就是批量下载百度图片。当时没学python，用c#实现了一个简易版本的批量下载器，如下图。 C#版本百度图片批量下载器（抓的是百度的wap站点，现在好像不能用了）当时“时间紧，任务重“，既没仔细研究百度图片API，也没处理好界面线程阻塞的问题。这个问题其实很有意思，趁着暑假在家，实现了一个比较完美的python版本，先上效果图。 python3版本百度图片批量下载器新版使用了python-3.4.3.amd64.msi + PyQt5-5.5-gpl-Py3.4-Qt5.5.0-x64.exe + eric6-6.0.8.zip + cx_Freeze-4.3.4-cp34-none-win_amd64.whl，完整项目在我的GitHub上。大致有如下几点工作：研究百度图片API，获取原始图片URL列表使用python3进行多线程下载利用pyqt5实现界面利用cx_Freeze4打包整个程序下面记录每个步骤的要点，供后人参考。百度图片API 正常使用百度图片搜索的时候，URL是这样的： http://image.baidu.com/search/index?ct=201326592&z=0&tn=baiduimage&ipn=r&word=%E6%AD%A6%E6%B1%89%E5%A4%A7%E5%AD%A6&pn=0&istype=2&ie=utf-8&oe=utf-8&cl=2&lm=-1&st=-1&fr=&fmq=1439374041843_R&ic=0&se=&sme=&width=0&height=0&face=0 里面有很多参数，有些我们并不需要，精简之后大概是这样的： http://image.baidu.com/i?tn=baiduimage&ie=utf-8&word=%E7%BE%8E%E5%A5%B3&pn=&rn=&z= word为搜索关键词；pn为page number当前是第几页，实际含义是image id，表示第几张图片，从0开始；rn为每一页的图片数量，最大为60；z表示图片尺寸，z=9特大尺寸，z=3大尺寸，z=2中等尺寸，z=1小尺寸，z=0所有尺寸。但是这个URL是给”人“看的，下一页的图片是动态加载的，其html代码的图片URL数量固定。一番查询之后发现，将tn=baiduimage换成tn=resultjson_com能获取到所有图片URL的json，json当然是给”猴“看的，这样就能轻松获取到所有图片的URL。慢着，仔细看看json中的objURL，是一串连”猴“都看不懂的字符串，原来百度把图片真实URL加密了，好在加密方法是简单的字符映射，参考这篇博客成功解密。更新：tn=resultjson_com的objURL是加密了，但是tn=resultjson的objURL并没有加密，所以采用tn=resultjson最佳。通过控制pn和rn就能获取指定数量的图片URL，但是我发现rn最大只能为60，并且不同的pn可能会有相同的图片url（比如pn=0和pn=1都有ippr_z2C$qAzdH3FAzdH3Fooo_z&e3Bd8vs7k_z&e3Bv54_z&e3BvgAzdH3F7rs5w1utsjAzdH3Fda8nAzdH3Fa080AzdH3Fda8na080aldm9bb8m_z&e3B3r2这个objURL），所以使用python的集合（set）去重。更新：pn实际上指图片的id，pn=0、rn=60能获取到从0~59这60个URL列表，pn=1、rn=60能获取到从1~60这60个URL列表，所以pn=0和pn=1的列表中当然有59个是重复的。正确的做法是pn=0、rn=60获取0~59这60个URL列表，然后pn=60、rn=60获取60~119这60个列表，以此类推，这样获取到的URL就不会有重复的了。获取图片URL列表的简要代码如下： 1 2 3 4 5 6 7 8 9 10 11 def ParseJSON(self, pn, rn, st): url = 'http://image.baidu.com/i?tn=resultjson_com&amp;ie=utf-8&amp;word=%s&amp;pn=%d&amp;rn=%d&amp;z=%d'%(self.word, pn, rn, self.size) #print(url) request = urllib.request.Request(url = url, headers = my_header) html = urllib.request.urlopen(request).read() hjson = json.loads(html.decode('gbk')) for i in range(0, len(hjson['data'])-1):#最后一个数据为空 img_url = self.DecodeURL(hjson['data'][i]['objURL']) if img_url not in st: st.add(img_url)#去重 self.progressBar_updated_signal.emit()#更新进度条 DecodeURL是解密函数。很奇怪，json最后一个数据是空的。 ...

Python

逻辑回归之Python应用实例

百度图片批量下载器（python3 + pyqt5 + eric6 + cx_Freeze4）