import os
import pickle
from collections import defaultdict  # word-frequency dictionaries
from operator import itemgetter

import gensim  # word2vec
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split  # dataset splitting
from tqdm import tqdm
### Usage examples
# Word-frequency counter: a missing key starts at int() == 0, so the
# increment needs no prior existence check.
dict1 = defaultdict(int)
dict1[word] = dict1[word] + 1
# Read the CSV, keep the first 20 rows, and take their `text` column.
data = pd.read_csv('')[:20].text
# Load a previously saved Word2Vec model from disk.
model_w2v = gensim.models.Word2Vec.load('data/wiki.Mode')
# The commented-out lines below are example queries against the loaded model.
# NOTE(review): some of these attribute names (syn0, vocab, index2word) look
# like they come from older gensim releases -- confirm against the installed version.
#model_w2v.wv.most_similar("民生银行") # find the most similar words
# model_w2v.wv.get_vector("民生银行") # look up the vector of a word
# model_w2v.wv.syn0 # same as model_w2v.wv.vectors: both show the vectors
# model_w2v.wv.vocab # view the words and their corresponding vectors
# model_w2v.wv.index2word # the word that corresponds to each index
# Split samples and labels into train/test parts: 20% is held out for
# testing, and the fixed random_state makes the split repeatable.
(train_words, test_words,
 train_labels, test_labels) = train_test_split(
    x, label, random_state=42, test_size=0.2
)
# Pick several positions from a sequence in one call: the index list is
# unpacked into itemgetter(0, 1, 2), and the resulting getter is applied to
# the sequence.
# Result: the tuple (a, b, c) -- itemgetter returns a tuple, not a list.
lists = itemgetter(*[0, 1, 2])([a, b, c, d])
# Serialize an object to a file. The `with` block closes the handle even if
# dumping fails.
with open('', 'wb') as fout:
    pickle.dump('', fout)
# Deserialize it again. pickle.load takes the open binary file object as its
# only positional argument; bind the return value to keep the loaded object.
# NOTE: only unpickle files from trusted sources -- loading a pickle can
# execute arbitrary code.
with open('', 'rb') as fin:
    pickle.load(fin)
# Select the compute device: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Configuration dictionary. Entries set to None are placeholders that get no
# value at this point.
parameters = {
    # minimum word frequency setting
    'min_count_word': 1,
    'word2ind': None,
    'ind2word': None,
    'ind2embedding': None,
    'output_size': None,
    'epoch': 20,
    'batch_size': 10,
    'embedding_dim': 300,
    'hidden_size': 128,
    'num_layers': 2,  # number of stacked LSTM layers
    'dropout': 0.5,
    'cuda': device,  # the device object selected above
    'lr': 0.01,
    'num_unknow': 0,
}
# Pull (x, y, keys, epoch) tuples from the `train_yield` iterator until it
# hands back a falsy `keys`, which ends the loop.
# NOTE(review): if the iterator is exhausted before that happens, next()
# raises StopIteration -- confirm the producer always emits the end marker.
while True:
    x, y, keys, epoch = next(train_yield)
    if not keys:
        break
# 全部评论  ("All comments" -- leftover text from the source web page, not code)