Jieba

虽然主要的检索功能实现了，但是我们还需要一个“推荐阅读”的功能。当用户浏览某条具体新闻时，我们在页面底端给出5条和该新闻相关的新闻，也就是一个最简单的推荐系统。搜狐新闻“相关新闻”模块推荐模块的思路是度量两两新闻之间的相似度，取相似度最高的前5篇新闻作为推荐阅读的新闻。我们前面讲过，一篇文档可以用一个向量表示，向量中的每个值是不同词项t在该文档d中的词频tf。但是一篇较短的文档（如新闻）的关键词并不多，所以我们可以提取每篇新闻的关键词，用这些关键词的tfidf值构成文档的向量表示，这样能够大大减少相似度计算量，同时保持较好的推荐效果。 jieba分词组件自带关键词提取功能，并能返回关键词的tfidf值。所以对每篇新闻，我们先提取tfidf得分最高的前25个关键词，用这25个关键词的tfidf值作为文档的向量表示。由此能够得到一个1000*m的文档词项矩阵M，矩阵每行表示一个文档，每列表示一个词项，m为1000个文档的所有互异的关键词（大概10000个）。矩阵M当然也是稀疏矩阵。得到文档词项矩阵M之后，我们利用sklearn的pairwise_distances函数计算M中行向量之间的cosine相似度，对每个文档，得到与其最相似的前5篇新闻id，并把结果写入数据库。推荐阅读模块的代码如下： 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 # -*- coding: utf-8 -*- """ Created on Wed Dec 23 14:06:10 2015 @author: bitjoy.net """ from os import listdir import xml.etree.ElementTree as ET import jieba import jieba.analyse import sqlite3 import configparser from datetime import * import math import pandas as pd import numpy as np from sklearn.metrics import pairwise_distances class RecommendationModule: stop_words = set() k_nearest = [] config_path = '' config_encoding = '' doc_dir_path = '' doc_encoding = '' stop_words_path = '' stop_words_encoding = '' idf_path = '' db_path = '' def __init__(self, config_path, config_encoding): self.config_path = config_path self.config_encoding = config_encoding config = configparser.ConfigParser() config.read(config_path, config_encoding) self.doc_dir_path = config['DEFAULT']['doc_dir_path'] self.doc_encoding = config['DEFAULT']['doc_encoding'] self.stop_words_path = config['DEFAULT']['stop_words_path'] self.stop_words_encoding = config['DEFAULT']['stop_words_encoding'] self.idf_path = config['DEFAULT']['idf_path'] self.db_path = config['DEFAULT']['db_path'] f = open(self.stop_words_path, encoding = self.stop_words_encoding) words = f.read() self.stop_words = set(words.split('\n')) def write_k_nearest_matrix_to_db(self): conn = sqlite3.connect(self.db_path) c = conn.cursor() c.execute("'DROP TABLE IF EXISTS knearest'") c.execute("'CREATE TABLE knearest(id INTEGER PRIMARY KEY, first INTEGER, second INTEGER, third INTEGER, fourth INTEGER, fifth INTEGER)'") for docid, doclist in self.k_nearest: c.execute("INSERT INTO knearest VALUES (?, ?, ?, ?, ?, ?)", tuple([docid] + doclist)) conn.commit() conn.close() def is_number(self, s): try: float(s) return True except ValueError: return False def construct_dt_matrix(self, files, topK = 200): jieba.analyse.set_stop_words(self.stop_words_path) jieba.analyse.set_idf_path(self.idf_path) M = len(files) N = 1 terms = {} dt = [] for i in files: root = ET.parse(self.doc_dir_path + i).getroot() title = root.find('title').text body = root.find('body').text docid = int(root.find('id').text) tags = jieba.analyse.extract_tags(title + '。' + body, topK=topK, withWeight=True) #tags = jieba.analyse.extract_tags(title, topK=topK, withWeight=True) cleaned_dict = {} for word, tfidf in tags: word = word.strip().lower() if word == '' or self.is_number(word): continue cleaned_dict[word] = tfidf if word not in terms: terms[word] = N N += 1 dt.append([docid, cleaned_dict]) dt_matrix = [[0 for i in range(N)] for j in range(M)] i =0 for docid, t_tfidf in dt: dt_matrix[i][0] = docid for term, tfidf in t_tfidf.items(): dt_matrix[i][terms[term]] = tfidf i += 1 dt_matrix = pd.DataFrame(dt_matrix) dt_matrix.index = dt_matrix[0] print('dt_matrix shape:(%d %d)'%(dt_matrix.shape)) return dt_matrix def construct_k_nearest_matrix(self, dt_matrix, k): tmp = np.array(1 – pairwise_distances(dt_matrix[dt_matrix.columns[1:]], metric = "cosine")) similarity_matrix = pd.DataFrame(tmp, index = dt_matrix.index.tolist(), columns = dt_matrix.index.tolist()) for i in similarity_matrix.index: tmp = [int(i),[]] j = 0 while j <= k: max_col = similarity_matrix.loc[i].idxmax(axis = 1) similarity_matrix.loc[i][max_col] = -1 if max_col != i: tmp[1].append(int(max_col)) #max column name j += 1 self.k_nearest.append(tmp) def gen_idf_file(self): files = listdir(self.doc_dir_path) n = float(len(files)) idf = {} for i in files: root = ET.parse(self.doc_dir_path + i).getroot() title = root.find('title').text body = root.find('body').text seg_list = jieba.lcut(title + '。' + body, cut_all=False) seg_list = set(seg_list) – self.stop_words for word in seg_list: word = word.strip().lower() if word == '' or self.is_number(word): continue if word not in idf: idf[word] = 1 else: idf[word] = idf[word] + 1 idf_file = open(self.idf_path, 'w', encoding = 'utf-8') for word, df in idf.items(): idf_file.write('%s %.9f\n'%(word, math.log(n / df))) idf_file.close() def find_k_nearest(self, k, topK): self.gen_idf_file() files = listdir(self.doc_dir_path) dt_matrix = self.construct_dt_matrix(files, topK) self.construct_k_nearest_matrix(dt_matrix, k) self.write_k_nearest_matrix_to_db() if __name__ == "__main__": print('—–start time: %s—–'%(datetime.today())) rm = RecommendationModule('../config.ini', 'utf-8') rm.find_k_nearest(5, 25) print('—–finish time: %s—–'%(datetime.today())) 这个模块的代码量最多，主要原因是需要构建文档词项矩阵，并且计算k邻居矩阵。矩阵数据结构的设计需要特别注意，否则会严重影响系统的效率。我刚开始把任务都扔给了pandas.DataFrame，后来发现当两个文档向量合并时，需要join连接操作，当数据量很大时，非常耗时，所以改成了先用python原始的list存储，最后一次性构造一个完整的pandas.DataFrame，速度比之前快了不少。 ...

目前正是所谓的“大数据”时代，数据量多到难以计数，怎样结构化的存储以便于分析计算，是当前的一大难题。上一篇博客我们简单抓取了1000个搜狐新闻数据，搜索的过程就是从这1000个新闻中找出和关键词相关的新闻来，那么怎样快速搜索呢，总不可能依次打开xml文件一个字一个字的找吧，这时就需要借助倒排索引这个强大的数据结构。在讲倒排索引之前，我们先介绍一下布尔检索。布尔检索只是简单返回包含某个关键词的文档，比如查询“苹果手机”，则返回所有包含“苹果”和“手机”关键词的文档，布尔检索并不对返回结果排序，所以有可能返回的第一个文档是“某个男孩边吃苹果边玩手机…“。实现布尔检索并不难，我们需要构建一个如下图的词项文档矩阵：图1. 布尔检索中的词项文档矩阵每行对应一个词项，每列对应一个文档，如果该值为1，表示该行词项出现在该列文档中。比如词项”苹果“出现在doc1和doc3文档中，如果我们要找同时出现”苹果“和”手机“的文档，只需把他们对应的向量取出来进行”与“操作，此为101&011=001，所以doc3同时出现了”苹果“和”手机“两个关键词，我们将其返回。布尔检索虽然很快，但是它也有很多缺陷，比如不能对结果排序，词项只有出现和不出现两种状态，但是一篇文档中出现10次“苹果“和只出现1次”苹果“，他们的相关度肯定是不相同的。所以需要对布尔检索进行改进。在扫描文档时，不但记录某词项出现与否，还记录该词项出现的次数，即词项频率(tf)；同时我们记录该文档的长度(ld)，以及某词项在不同文档中出现的次数，即文档频率(df)。图2. 倒排索引结构图这样我们就得到了如上图的倒排索引。左边部分被称为词典，存储的是1000个新闻中所有不同的词项；右边部分被称为倒排记录表，存储的是出现Term_i的那些文档信息。倒排索引中存储的变量都是为了给后续检索模型使用。讲到这里，我们需要解决如下几个问题。怎样得到一篇文档中的所有词项。给我们一篇新闻稿子，人类很容易分辨出”苹果“和”手机“是两个不同的词项，但是计算机怎么知道是这两个词呢？为什么不是”苹”、”国手“和”机“呢？这就需要进行中文分词，我们可以借助开源的jieba中文分词组件来完成，jieba分词能够将一个中文句子切成一个个词项，这样我们就可以统计tf, df了。有些词，如”的“、”地“、”得“、”如果“等，几乎每篇文档都会出现，他们起不到很好的区分文档的效果，这类词被称为”停用词“，我们需要把他们去掉。去停词的步骤可以在jieba分词之后完成。怎样存储倒排记录表。假设1000个文档共有20000个不同的词项，如果用类似图1的矩阵形式存储，需要耗费100020000=210^7个存储单元，但是图1往往是一个稀疏矩阵，因为一个文档中可能只出现了200个不同的词项，剩余的19800个词项都是空的。用矩阵方式存储时空效率都不高。所以我们可以采用图2的方式，词典用B-树或hash存储，倒排记录表用邻接链表存储方式，这样能大大减少存储空间。如果我们要将图2保存到数据库，可以对倒排记录表序列化成一个长的字符串，写入到一个单元格，读取的时候再反序列化。比如每个Doc内部用’\t’连接，Doc之间用’\n’连接，读取的时候split即可。倒排索引构建算法使用内存式单遍扫描索引构建方法（SPIMI），其实就是依次对每篇新闻进行分词，如果出现新的词项则插入到词典中，否则将该文档的信息追加到词项对应的倒排记录表中。SPIMI的伪代码如下：图3. SPIMI算法伪代码下面是构建索引的所有代码： 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 # -*- coding: utf-8 -*- """ Created on Sat Dec 5 23:31:22 2015 @author: bitjoy.net """ from os import listdir import xml.etree.ElementTree as ET import jieba import sqlite3 import configparser class Doc: docid = 0 date_time = '' tf = 0 ld = 0 def __init__(self, docid, date_time, tf, ld): self.docid = docid self.date_time = date_time self.tf = tf self.ld = ld def __repr__(self): return(str(self.docid) + '\t' + self.date_time + '\t' + str(self.tf) + '\t' + str(self.ld)) def __str__(self): return(str(self.docid) + '\t' + self.date_time + '\t' + str(self.tf) + '\t' + str(self.ld)) class IndexModule: stop_words = set() postings_lists = {} config_path = '' config_encoding = '' def __init__(self, config_path, config_encoding): self.config_path = config_path self.config_encoding = config_encoding config = configparser.ConfigParser() config.read(config_path, config_encoding) f = open(config['DEFAULT']['stop_words_path'], encoding = config['DEFAULT']['stop_words_encoding']) words = f.read() self.stop_words = set(words.split('\n')) def is_number(self, s): try: float(s) return True except ValueError: return False def clean_list(self, seg_list): cleaned_dict = {} n = 0 for i in seg_list: i = i.strip().lower() if i != '' and not self.is_number(i) and i not in self.stop_words: n = n + 1 if i in cleaned_dict: cleaned_dict[i] = cleaned_dict[i] + 1 else: cleaned_dict[i] = 1 return n, cleaned_dict def write_postings_to_db(self, db_path): conn = sqlite3.connect(db_path) c = conn.cursor() c.execute("'DROP TABLE IF EXISTS postings'") c.execute("'CREATE TABLE postings(term TEXT PRIMARY KEY, df INTEGER, docs TEXT)'") for key, value in self.postings_lists.items(): doc_list = '\n'.join(map(str,value[1])) t = (key, value[0], doc_list) c.execute("INSERT INTO postings VALUES (?, ?, ?)", t) conn.commit() conn.close() def construct_postings_lists(self): config = configparser.ConfigParser() config.read(self.config_path, self.config_encoding) files = listdir(config['DEFAULT']['doc_dir_path']) AVG_L = 0 for i in files: root = ET.parse(config['DEFAULT']['doc_dir_path'] + i).getroot() title = root.find('title').text body = root.find('body').text docid = int(root.find('id').text) date_time = root.find('datetime').text seg_list = jieba.lcut(title + '。' + body, cut_all=False) ld, cleaned_dict = self.clean_list(seg_list) AVG_L = AVG_L + ld for key, value in cleaned_dict.items(): d = Doc(docid, date_time, value, ld) if key in self.postings_lists: self.postings_lists[key][0] = self.postings_lists[key][0] + 1 # df++ self.postings_lists[key][1].append(d) else: self.postings_lists[key] = [1, [d]] # [df, [Doc]] AVG_L = AVG_L / len(files) config.set('DEFAULT', 'N', str(len(files))) config.set('DEFAULT', 'avg_l', str(AVG_L)) with open(self.config_path, ‘w’, encoding = self.config_encoding) as configfile: config.write(configfile) self.write_postings_to_db(config[‘DEFAULT’][‘db_path’]) if __name__ == "__main__": im = IndexModule('../config.ini', 'utf-8') im.construct_postings_lists() 运行之后会在./data/下生成一个ir.db数据库文件，这就是构建好的索引数据库。 ...

Jieba

和我一起构建搜索引擎（五）推荐阅读

和我一起构建搜索引擎（三）构建索引