import codecs

import gensim
import numpy
import pandas as pd
import jieba.posseg as pog
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


# Build the training corpus: concatenate every comment into a single string.
def data_handle(data):
    n = data.shape[0]
    data_str = ''
    for i in range(n):
        data_str += str(data.loc[i, 'comment'])
    return data_str


# Segment the corpus and write one word per line to weibo.txt.
def fenci(data_str, stop_property, stopfile):
    # Stop words
    stop_word = [word.strip() for word in open(stopfile, encoding='utf-8').readlines()]
    # Segmentation with POS tagging
    word_cut = pog.cut(data_str)
    with open('weibo.txt', 'w', encoding='utf-8') as f:
        for word, flag in word_cut:
            if flag not in stop_property and word not in stop_word:
                f.write(word + '\n')


# Turn the raw corpus into an iterator of sentences (each sentence is a list of words),
# then train and save a Word2Vec model.
def vctor_word():
    sentences = LineSentence('weibo.txt')
    # gensim >= 4.0 uses vector_size; on gensim < 4.0 pass size=100 instead.
    model = Word2Vec(sentences, sg=0, vector_size=100, window=5, min_count=5, workers=9)
    model.save('weibo.word2vec')


# Given an arbitrary string, return the positions (and hence the total count) of a character.
def get_char_pos(string, char):
    return [(pos, char) for pos, val in enumerate(string) if val == char]


# Extract keywords from each comment; their vectors are looked up in the trained model later.
def cut_data(data, stopfile):
    data.fillna(0, inplace=True)
    stop_word = [word.strip() for word in open(stopfile, encoding='utf-8').readlines()]
    charater = ['a', 'nr', 'ns', 'nt', 'ng', 'vn', 'vi', 'l', 'n', 'v']  # POS tags to keep
    m = data.shape[0]
    with open('seg_word.txt', 'w', encoding='utf-8') as f:
        for i in range(m):
            str_cut = ''
            comment = data.loc[i, 'comment']
            if comment != 0:
                segs = pog.cut(str(comment))
                for word, flag in segs:
                    if flag in charater and word not in stop_word:
                        str_cut += word + '/'
                f.write(str_cut + '\n')  # one comment per line
            else:
                f.write('\n')


# Sum the vectors of all '/'-separated keywords that are in the model vocabulary.
def get_vector(data, model):
    wordvec_size = 100  # must match the training vector size
    word_vec_all = numpy.zeros(wordvec_size)
    slash_pos = get_char_pos(data, '/')
    if not slash_pos:
        return word_vec_all
    first_word = data[0:slash_pos[0][0]]
    if first_word in model.wv:
        word_vec_all = word_vec_all + model.wv[first_word]
    for i in range(len(slash_pos) - 1):
        word = data[slash_pos[i][0] + 1:slash_pos[i + 1][0]]
        if word in model.wv:
            word_vec_all = word_vec_all + model.wv[word]  # bug fix: was model[first_word]
    return word_vec_all


# Score every keyword line in file_name against keyword_str by cosine similarity.
def word2vec(file_name, model, keyword_str):
    with codecs.open(file_name, 'r', encoding='utf-8') as data_file:
        data_set = data_file.readlines()[:-1]
    score_list = []
    str_vector = get_vector(keyword_str, model)
    for data in data_set:
        if data.strip() != '':
            word_vec_all = get_vector(data, model)
            score = simlarityCalu(word_vec_all, str_vector)
        else:
            score = 0
        score_list.append(score)
    print('score_list', score_list)
    return score_list


# Word-vector similarity: cosine.
def simlarityCalu(vector1, vector2):
    vector1Mod = numpy.sqrt(vector1.dot(vector1))
    vector2Mod = numpy.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = vector1.dot(vector2) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity


if __name__ == '__main__':
    # POS tags to drop when building the training corpus
    stop_property = ['b', 'c', 'd', 'e', 'f', 'm', 'o', 'p', 'q', 'r', 't', 'u', 'x', 'y',
                     'z', 'uj', 'nrt', 'eng', 'zg', 'ul']
    stop_file = 'stop.txt'
    # Read the data
    data = pd.read_excel('C:/E/weibo.xlsx')
    data.rename(columns={'粉丝ID': 'fans_id', '粉丝': 'fans_name', '微博账户id': 'weibo_user_id',
                         '微博名': 'weibo_name', '微博id': 'weibo_id', '评论id': 'comment_id',
                         '评论': 'comment'}, inplace=True)
    # Concatenate all comments into one string
    comment_str = data_handle(data)
    # Build the corpus file
    fenci(comment_str, stop_property, stop_file)
    # Train the model
    vctor_word()
    # Extract keywords per comment
    cut_data(data, stop_file)
    p1_keywords = 'seg_word.txt'
    str1 = '农农/陈利农/宝贝'
    # model = gensim.models.Word2Vec.load('weibo.word2vec')
    model = gensim.models.Word2Vec.load('zhiwiki_news.word2vec')
    p1_vec = word2vec(p1_keywords, model, str1)
    str2 = '舒蔻 尤妮佳 买'