import spacy
import json
from tqdm import tqdm
from collections import Counter
import random
import codecs
import numpy as np
import os
import tensorflow as tf
# Load a blank English spaCy pipeline (tokenizer only)
nlp = spacy.blank('en')
# Tokenize a sentence into a list of token strings
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]
# Map each token to its character span in the original text.
# Example output: [(0, 1), (2, 6), (7, 8), (8, 9), (9, 10), (11, 15), (16, 18)]
# Each tuple is (start offset, start offset + token length), i.e. the span of
# that token inside the text; the difference of the two values is the token length.
def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print('Token {} cannot be found!'.format(token))
            raise Exception()
        # (start, end) character offsets of this token in the text
        spans.append((current, current + len(token)))
        current += len(token)
    return spans
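# A small usage sketch (mine, not part of the original pipeline): run the tokenizer
# and convert_idx on a short sentence to make the (start, end) spans concrete.
def _demo_convert_idx():
    text = "A force is a push"
    tokens = word_tokenize(text)        # ['A', 'force', 'is', 'a', 'push']
    print(convert_idx(text, tokens))    # [(0, 1), (2, 7), (8, 10), (11, 12), (13, 17)]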
# Preprocess one SQuAD-style JSON file into training examples and eval metadata
def process_file(filename, data_type=None, word_counter=None, char_counter=None):
    print("Generating {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0
    with open(filename, 'r') as fh:
        source = json.load(fh)
        print(len(source['data']))
        # iterate over every article (the dev set has 48 articles)
        for article in tqdm(source["data"]):
            # iterate over the paragraphs of each article
            for para in article['paragraphs']:
                # normalize '' and `` quotes in the context
                context = para["context"].replace("''", '" ').replace("``", '" ')
                # tokenize the context; punctuation and special symbols remain as tokens
                # e.g. ['The', 'connection', 'between', 'macroscopic', 'nonconservative', 'forces', ...]
                context_tokens = word_tokenize(context)
                # character-level representation of every token
                context_chars = [list(token) for token in context_tokens]
                # (start, end) character span of every token
                spans = convert_idx(context, context_tokens)
                for token in context_tokens:
                    # weight each context token by the number of questions asked about this paragraph
                    word_counter[token] += len(para["qas"])
                    for char in token:
                        # characters are weighted the same way,
                        # e.g. char_counter ends up like Counter({'e': 28293, 'a': 19610, 'n': 17317, ...})
                        char_counter[char] += len(para["qas"])
                # iterate over the question/answer pairs
                for qa in para["qas"]:
                    total += 1
                    # normalize quotes in the question
                    ques = qa["question"].replace(
                        "''", '" ').replace("``", '" ')
                    # tokenize the question
                    ques_tokens = word_tokenize(ques)
                    # characters of every question token
                    ques_chars = [list(token) for token in ques_tokens]
                    # question tokens and characters are counted once each
                    for token in ques_tokens:
                        word_counter[token] += 1
                        for char in token:
                            char_counter[char] += 1
                    y1s, y2s = [], []
                    answer_texts = []
                    # iterate over the annotated answers
                    for answer in qa["answers"]:
                        # answer text and its character offsets
                        answer_text = answer["text"]
                        answer_start = answer['answer_start']
                        answer_end = answer_start + len(answer_text)
                        answer_texts.append(answer_text)
                        answer_span = []
                        # collect the indices of all tokens overlapping the answer span
                        for idx, span in enumerate(spans):
                            if not (answer_end <= span[0] or answer_start >= span[1]):
                                answer_span.append(idx)
                        y1, y2 = answer_span[0], answer_span[-1]
                        y1s.append(y1)
                        y2s.append(y2)
                    example = {"context_tokens": context_tokens, "context_chars": context_chars,
                               "ques_tokens": ques_tokens, "ques_chars": ques_chars,
                               "y1s": y1s, "y2s": y2s, "id": total}
                    examples.append(example)
                    # untokenized context and spans kept for evaluation
                    eval_examples[str(total)] = {
                        "context": context, "spans": spans,
                        "answers": answer_texts, "uuid": qa["id"]}
        random.shuffle(examples)
        print("{} questions in total".format(len(examples)))
    return examples, eval_examples
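# Usage sketch with toy data of my own (the real script feeds SQuAD-format files from
# config): write a one-question dataset to disk and inspect what process_file returns.
def _demo_process_file(tmp_path="toy_squad.json"):
    toy = {"data": [{"paragraphs": [{
        "context": "TensorFlow was released by Google in 2015.",
        "qas": [{"id": "q1", "question": "Who released TensorFlow?",
                 "answers": [{"text": "Google", "answer_start": 27}]}]}]}]}
    with open(tmp_path, "w") as fh:
        json.dump(toy, fh)
    word_counter, char_counter = Counter(), Counter()
    examples, eval_examples = process_file(tmp_path, "toy", word_counter, char_counter)
    print(examples[0]["context_tokens"])           # tokenized paragraph
    print(examples[0]["y1s"], examples[0]["y2s"])  # answer start/end token indices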
# Build the embedding matrix for words or characters
def get_embedding(counter, data_type, limit=-1, emb_file=None, size=None, vec_size=None, token2idx_dict=None):
    print("Generating {} embedding...".format(data_type))
    embedding_dict = {}
    # keep only tokens whose frequency exceeds the limit
    filtered_elements = [k for k, v in counter.items() if v > limit]
    # a pretrained embedding file was provided
    if emb_file is not None:
        assert size is not None      # abort if the vocabulary size is missing
        assert vec_size is not None  # abort if the vector dimension is missing
        # read the pretrained vectors line by line
        with codecs.open(emb_file, "r", encoding="utf-8") as fh:
            for line in tqdm(fh, total=size):
                # split the line into the token and its vector
                array = line.split()
                word = "".join(array[0:-vec_size])
                vector = list(map(float, array[-vec_size:]))
                # keep the vector only if the token appears in the counter more than limit times
                if word in counter and counter[word] > limit:
                    embedding_dict[word] = vector
        print("{} / {} tokens have corresponding {} embedding vector".format(
            len(embedding_dict), len(filtered_elements), data_type))
    # no pretrained file: initialise each kept token with a small random vector
    else:
        assert vec_size is not None
        for token in filtered_elements:
            embedding_dict[token] = [np.random.normal(scale=0.01) for _ in range(vec_size)]
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))

    # special tokens for padding and out-of-vocabulary words
    NULL = "--NULL--"
    OOV = "--OOV--"
    # real tokens are indexed from 2 so that 0 and 1 stay reserved for NULL and OOV
    token2idx_dict = {token: idx for idx, token in enumerate(
        embedding_dict.keys(), 2)} if token2idx_dict is None else token2idx_dict
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    embedding_dict[NULL] = [0. for _ in range(vec_size)]
    embedding_dict[OOV] = [0. for _ in range(vec_size)]
    # map each token id to its embedding vector
    id2emb_dict = {idx: embedding_dict[token] for token, idx in token2idx_dict.items()}
    # assemble the embedding matrix ordered by id
    emb_mat = [id2emb_dict[idx] for idx in range(len(id2emb_dict))]
    # return the embedding matrix and the token-to-id dictionary
    return emb_mat, token2idx_dict
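# Usage sketch for the random-initialisation branch (no pretrained file); the counter
# and vec_size below are toy values chosen only for illustration.
def _demo_get_embedding():
    toy_counter = Counter({"force": 5, "energy": 3, "entropy": 1})
    emb_mat, token2idx = get_embedding(toy_counter, "word", vec_size=4)
    print(token2idx)                      # --NULL-- -> 0, --OOV-- -> 1, real tokens from 2
    print(len(emb_mat), len(emb_mat[0]))  # (#tokens + 2, 4)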
# Build TFRecord features for questions, paragraphs, answers, and so on
def build_features(config, examples, data_type, out_file, word2idx_dict, char2idx_dict, is_test=False):
    # paragraph length limit
    para_limit = config.test_para_limit if is_test else config.para_limit
    # question length limit
    ques_limit = config.test_ques_limit if is_test else config.ques_limit
    # per-token character limit
    char_limit = config.char_limit

    # drop examples whose paragraph or question exceeds the limits
    def filter_func(example, is_test=False):
        return len(example["context_tokens"]) > para_limit or len(example["ques_tokens"]) > ques_limit

    print("Processing {} examples...".format(data_type))
    writer = tf.python_io.TFRecordWriter(out_file)
    total = 0
    total_ = 0
    meta = {}
    for example in tqdm(examples):
        total_ += 1
        # skip examples that are too long
        if filter_func(example, is_test):
            continue
        total += 1
        # paragraph word ids
        context_idxs = np.zeros([para_limit], dtype=np.int32)
        # paragraph character-id matrix
        context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
        # question word ids
        ques_idxs = np.zeros([ques_limit], dtype=np.int32)
        # question character-id matrix
        ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)
        # one-hot answer start/end positions over the paragraph
        y1 = np.zeros([para_limit], dtype=np.float32)
        y2 = np.zeros([para_limit], dtype=np.float32)

        # look up a word id, trying several capitalisations; return 1 (--OOV--) if unknown
        def _get_word(word):
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in word2idx_dict:
                    return word2idx_dict[each]
            return 1

        # look up a character id; return 1 (--OOV--) if unknown
        def _get_char(char):
            if char in char2idx_dict:
                return char2idx_dict[char]
            return 1

        # word ids for the already tokenized paragraph
        for i, token in enumerate(example["context_tokens"]):
            context_idxs[i] = _get_word(token)
        # word ids for the already tokenized question
        for i, token in enumerate(example["ques_tokens"]):
            ques_idxs[i] = _get_word(token)
        # character ids for the paragraph; truncate at char_limit, pad with 0
        for i, token in enumerate(example["context_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break
                context_char_idxs[i, j] = _get_char(char)
        # character ids for the question; truncate at char_limit, pad with 0
        for i, token in enumerate(example["ques_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break
                ques_char_idxs[i, j] = _get_char(char)
        # answer start / end token positions (last annotated answer)
        start, end = example["y1s"][-1], example["y2s"][-1]
        y1[start], y2[end] = 1.0, 1.0
        # serialize everything into a tf.train.Example record
        record = tf.train.Example(features=tf.train.Features(feature={
            "context_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_idxs.tostring()])),
            "ques_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_idxs.tostring()])),
            "context_char_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_char_idxs.tostring()])),
            "ques_char_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_char_idxs.tostring()])),
            "y1": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1.tostring()])),
            "y2": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2.tostring()])),
            "id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example["id"]]))
        }))
        writer.write(record.SerializeToString())
    print("Build {} / {} instances of features in total".format(total, total_))
    meta["total"] = total
    writer.close()
    return meta
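# Sketch of how the records written above could be decoded again (assumes the same
# TF 1.x API family as tf.python_io.TFRecordWriter; the limits passed in must match
# the ones used at build time).
def _parse_record_sketch(serialized_example, para_limit, ques_limit, char_limit):
    features = tf.parse_single_example(
        serialized_example,
        features={
            "context_idxs": tf.FixedLenFeature([], tf.string),
            "ques_idxs": tf.FixedLenFeature([], tf.string),
            "context_char_idxs": tf.FixedLenFeature([], tf.string),
            "ques_char_idxs": tf.FixedLenFeature([], tf.string),
            "y1": tf.FixedLenFeature([], tf.string),
            "y2": tf.FixedLenFeature([], tf.string),
            "id": tf.FixedLenFeature([], tf.int64),
        })
    # the arrays were serialized with .tostring(), so decode_raw restores them
    context_idxs = tf.reshape(tf.decode_raw(features["context_idxs"], tf.int32), [para_limit])
    ques_idxs = tf.reshape(tf.decode_raw(features["ques_idxs"], tf.int32), [ques_limit])
    context_char_idxs = tf.reshape(
        tf.decode_raw(features["context_char_idxs"], tf.int32), [para_limit, char_limit])
    ques_char_idxs = tf.reshape(
        tf.decode_raw(features["ques_char_idxs"], tf.int32), [ques_limit, char_limit])
    y1 = tf.reshape(tf.decode_raw(features["y1"], tf.float32), [para_limit])
    y2 = tf.reshape(tf.decode_raw(features["y2"], tf.float32), [para_limit])
    return context_idxs, ques_idxs, context_char_idxs, ques_char_idxs, y1, y2, features["id"]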
# Save an object to a JSON file
def save(filename, obj, message=None):
    if message is not None:
        print("Saving {}...".format(message))
    with open(filename, "w") as fh:
        json.dump(obj, fh)
# Top-level preprocessing entry point
def prepro(config):
    # word and character counters shared by all splits
    word_counter, char_counter = Counter(), Counter()
    # process the training set
    train_examples, train_eval = process_file(
        config.train_file, "train", word_counter, char_counter)
    # process the dev set
    dev_examples, dev_eval = process_file(
        config.dev_file, "dev", word_counter, char_counter)
    # process the test set
    test_examples, test_eval = process_file(
        config.test_file, "test", word_counter, char_counter)

    # word embedding file
    word_emb_file = config.fasttext_file if config.fasttext else config.glove_word_file
    # character embedding file, vocabulary size, and dimension
    char_emb_file = config.glove_char_file if config.pretrained_char else None
    char_emb_size = config.glove_char_size if config.pretrained_char else None
    char_emb_dim = config.glove_dim if config.pretrained_char else config.char_dim

    # reuse an existing word2idx dictionary if one was saved before
    word2idx_dict = None
    if os.path.isfile(config.word2idx_file):
        with open(config.word2idx_file, "r") as fh:
            word2idx_dict = json.load(fh)
    # build the word embedding matrix
    word_emb_mat, word2idx_dict = get_embedding(word_counter, "word", emb_file=word_emb_file,
                                                size=config.glove_word_size, vec_size=config.glove_dim,
                                                token2idx_dict=word2idx_dict)
    # reuse an existing char2idx dictionary if one was saved before
    char2idx_dict = None
    if os.path.isfile(config.char2idx_file):
        with open(config.char2idx_file, "r") as fh:
            char2idx_dict = json.load(fh)
    # build the character embedding matrix
    char_emb_mat, char2idx_dict = get_embedding(
        char_counter, "char", emb_file=char_emb_file, size=char_emb_size,
        vec_size=char_emb_dim, token2idx_dict=char2idx_dict)
    # build features for the train / dev / test splits
    build_features(config, train_examples, "train",
                   config.train_record_file, word2idx_dict, char2idx_dict)
    dev_meta = build_features(config, dev_examples, "dev",
                              config.dev_record_file, word2idx_dict, char2idx_dict)
    test_meta = build_features(config, test_examples, "test",
                               config.test_record_file, word2idx_dict, char2idx_dict, is_test=True)
    # save every preprocessed artifact
    save(config.word_emb_file, word_emb_mat, message="word embedding")
    save(config.char_emb_file, char_emb_mat, message="char embedding")
    save(config.train_eval_file, train_eval, message="train eval")
    save(config.dev_eval_file, dev_eval, message="dev eval")
    save(config.test_eval_file, test_eval, message="test eval")
    save(config.dev_meta, dev_meta, message="dev meta")
    save(config.word2idx_file, word2idx_dict, message="word2idx")
    save(config.char2idx_file, char2idx_dict, message="char2idx")
    save(config.test_meta, test_meta, message="test meta")