BiDAF Code Reading Notes

1. Usage of assert: it checks a condition and aborts the program (raising AssertionError) if the condition does not hold.

a = -1
# Raises AssertionError with the message "a is out of range"
assert a > 0, "a is out of range"
# Passes without any output
assert a < 0

2. codecs.open() handles files in different encodings and decodes the file contents to unicode when reading.

import codecs

# codecs.open lets you specify the file's encoding; the content is
# automatically decoded to unicode when it is read.
bfile = codecs.open("dddd.txt", "r", "big5")
# bfile = open("dddd.txt", "r")

ss = bfile.read()
bfile.close()
# Printing ss now shows the decoded result; opening the file with the
# built-in open() instead would show garbled text here.

Below is a reading and analysis of prepro.py:

import spacy
import json
from tqdm import tqdm
from collections import Counter
import random
import codecs
import numpy as np
import os
import tensorflow as tf

# Load a blank English spaCy pipeline (tokenizer only)
nlp = spacy.blank('en')


# Tokenize a sentence into a list of token strings
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]


# Map each token back to its character span in the original text.
# Sample output: [(0, 1), (2, 6), (7, 8), (8, 9), (9, 10), (11, 15), (16, 18)]
# Each pair is the (start, end) character offset of a token,
# so end - start equals the token's length.
def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print('Token {} cannot be found!'.format(token))
            raise Exception()
        # The span of this token is (start, start + len(token))
        spans.append((current, current + len(token)))
        current += len(token)
    return spans
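
As a quick sanity check (this snippet is not part of prepro.py, and the sentence is my own), word_tokenize and convert_idx can be combined to verify that every span slices back to its token:

# Sanity check of word_tokenize + convert_idx (illustrative only).
text = "I think, therefore I am."
tokens = word_tokenize(text)       # e.g. ['I', 'think', ',', 'therefore', 'I', 'am', '.']
spans = convert_idx(text, tokens)
for token, (start, end) in zip(tokens, spans):
    assert text[start:end] == token  # each span slices back to its token
print(spans)                         # e.g. [(0, 1), (2, 7), (7, 8), ...]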


# Preprocess a SQuAD-style JSON file into training examples
def process_file(filename, data_type=None, word_counter=None, char_counter=None):
    print("Generating {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0
    with open(filename, 'r') as fh:
        source = json.load(fh)
        print(len(source['data']))
        # Iterate over the articles (the dev set has 48 articles)
        for article in tqdm(source["data"]):
            # Iterate over the paragraphs of each article
            for para in article['paragraphs']:
                # Normalize the quote characters '' and `` in the paragraph
                context = para["context"].replace("''", '" ').replace("``", '" ')
                # Tokenize the paragraph; punctuation and special symbols are kept
                # as tokens and are dealt with later
                context_tokens = word_tokenize(context)
                # Sample context_tokens:
                # ['The', 'connection', 'between', 'macroscopic', 'nonconservative', 'forces', 'and', 'microscopic', 'conservative', 'forces', 'is', 'described', 'by', 'detailed', 'treatment', 'with', 'statistical', 'mechanics', '.', 'In', 'macroscopic', 'closed', 'systems', ',', 'nonconservative', 'forces', 'act', 'to', 'change', 'the', 'internal', 'energies', 'of', 'the', 'system', ',', 'and', 'are', 'often', 'associated', 'with', 'the', 'transfer', 'of', 'heat', '.', 'According', 'to', 'the', 'Second', 'law', 'of', 'thermodynamics', ',', 'nonconservative', 'forces', 'necessarily', 'result', 'in', 'energy', 'transformations', 'within', 'closed', 'systems', 'from', 'ordered', 'to', 'more', 'random', 'conditions', 'as', 'entropy', 'increases', '.']
                # Character-level representation of each token
                context_chars = [list(token) for token in context_tokens]
                # (start, end) character span of every token
                spans = convert_idx(context, context_tokens)
                for token in context_tokens:
                    # Each context token is counted once per question of this paragraph,
                    # so context words referenced by many questions get larger counts
                    word_counter[token] += len(para["qas"])
                    for char in token:
                        # The same per-question weighting is applied to every character.
                        # Sample char_counter after processing:
                        # Counter({'e': 28293, 'a': 19610, 'n': 17317, 't': 17071, 'r': 15443, 'o': 15358, 'i':
                        # 14669, 's': 14081, 'h': 11839, 'l': 9031, 'd': 8982, 'c': 6540, 'u': 5885,
                        # 'w': 5806, 'f': 4516, 'g': 4463, 'p': 4372, 'm': 4165, ',': 3116, 'y': 2842, 'b': 2321, 'v': 2152,
                        # '.': 2057, 'B': 2052, 'S': 1832, '1': 1776, 'k': 1553, '0': 1168, 'C': 1107, 'F': 963, 'T': 876,
                        # '2': 856, 'P': 836, 'I': 819, '5': 798, 'N': 766, 'L': 741, 'X': 714, 'M': 672, '4': 662, '3': 636,
                        # 'A': 619, '9': 584, "'": 552, '-': 523, '7': 488, 'D': 470, '–': 415, '(': 412, ')': 412, '8': 380,
                        # '6': 371, 'V': 352, 'O': 272, 'J': 268, 'j': 249, 'q': 235, '"': 222, 'G': 221, 'x': 220, 'E': 177,
                        # 'R': 173, 'W': 168, 'K': 159, 'H': 117, 'U': 108, 'z': 107, '½': 81, ':': 81, ';': 63, '$': 49, '#': 30,
                        # 'é': 26, '/': 21, 'Q': 15})
                        char_counter[char] += len(para["qas"])
                # Iterate over the question/answer pairs
                for qa in para["qas"]:
                    total += 1
                    # Normalize '' and `` in the question
                    ques = qa["question"].replace(
                        "''", '" ').replace("``", '" ')
                    # Tokenize the question
                    ques_tokens = word_tokenize(ques)
                    # Character-level representation of each question token
                    ques_chars = [list(token) for token in ques_tokens]
                    # Question tokens and their characters are counted once each
                    for token in ques_tokens:
                        word_counter[token] += 1
                        for char in token:
                            char_counter[char] += 1
                    y1s, y2s = [], []
                    answer_texts = []
                    # Iterate over the answers
                    for answer in qa["answers"]:
                        # Answer text
                        answer_text = answer["text"]
                        # Character start and end positions of the answer
                        answer_start = answer['answer_start']
                        answer_end = answer_start + len(answer_text)
                        answer_texts.append(answer_text)
                        answer_span = []
                        # Collect the indices of all tokens whose span overlaps the answer
                        for idx, span in enumerate(spans):
                            if not (answer_end <= span[0] or answer_start >= span[1]):
                                answer_span.append(idx)
                        y1, y2 = answer_span[0], answer_span[-1]
                        y1s.append(y1)
                        y2s.append(y2)
                    example = {"context_tokens": context_tokens, "context_chars": context_chars,
                               "ques_tokens": ques_tokens,
                               "ques_chars": ques_chars, "y1s": y1s, "y2s": y2s, "id": total}
                    examples.append(example)
                    # Keep the untokenized context and spans for evaluation
                    eval_examples[str(total)] = {
                        "context": context, "spans": spans, "answers": answer_texts, "uuid": qa["id"]}
        random.shuffle(examples)
        print("{} questions in total".format(len(examples)))
        return examples, eval_examples
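
process_file expects SQuAD-v1.1-style JSON. A minimal sketch of that structure, using toy data I made up (the file name, question id, and text are placeholders), shows what the function reads:

# Toy SQuAD-style input for process_file (illustrative only).
import json
from collections import Counter

squad_like = {
    "data": [{
        "title": "Force",
        "paragraphs": [{
            "context": "A force is a push or a pull.",
            "qas": [{
                "id": "toy-qa-0001",
                "question": "What is a force?",
                "answers": [{"text": "a push or a pull", "answer_start": 11}]
            }]
        }]
    }]
}
with open("toy_squad.json", "w") as f:
    json.dump(squad_like, f)

word_counter, char_counter = Counter(), Counter()
examples, eval_examples = process_file("toy_squad.json", "toy", word_counter, char_counter)
print(examples[0]["y1s"], examples[0]["y2s"])  # token indices of the answer start/end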

# Build the embedding matrix and the token-to-index mapping
def get_embedding(counter, data_type, limit=-1, emb_file=None, size=None, vec_size=None, token2idx_dict=None):
    print("Generating {} embedding...".format(data_type))
    embedding_dict = {}
    # Keep only tokens whose count exceeds the limit
    filtered_elements = [k for k, v in counter.items() if v > limit]
    # If a pretrained embedding file is given
    if emb_file is not None:
        assert size is not None  # abort if size is missing
        assert vec_size is not None  # abort if vec_size is missing
        # Read the embedding file
        with codecs.open(emb_file, "r", encoding="utf-8") as fh:
            # Iterate over the lines of the embedding file
            for line in tqdm(fh, total=size):
                # Split the line into the token and its vector
                array = line.split()
                # The token is everything before the last vec_size fields
                word = "".join(array[0:-vec_size])
                # The vector is the last vec_size fields
                vector = list(map(float, array[-vec_size:]))
                # Keep the vector only if the token appears in the corpus more than limit times
                if word in counter and counter[word] > limit:
                    embedding_dict[word] = vector
        print("{} / {} tokens have corresponding {} embedding vector".format(
            len(embedding_dict), len(filtered_elements), data_type))
    # Otherwise initialize embeddings randomly
    else:
        assert vec_size is not None
        # Iterate over the filtered tokens
        for token in filtered_elements:
            # Random-normal initialization for each token
            embedding_dict[token] = [np.random.normal(scale=0.01) for _ in range(vec_size)]
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))
    # Special tokens for padding and out-of-vocabulary words
    NULL = "--NULL--"
    OOV = "--OOV--"
    # Map tokens to indices starting from 2; 0 and 1 are reserved for NULL and OOV
    token2idx_dict = {token: idx for idx, token in enumerate(embedding_dict.keys(), 2)} if token2idx_dict is None else token2idx_dict
    # Indices of NULL and OOV
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    # Zero vectors for NULL and OOV
    embedding_dict[NULL] = [0. for _ in range(vec_size)]
    embedding_dict[OOV] = [0. for _ in range(vec_size)]
    # Map each index to its embedding vector
    id2emb_dict = {idx: embedding_dict[token] for token, idx in token2idx_dict.items()}
    # Embedding matrix ordered by index
    emb_mat = [id2emb_dict[idx] for idx in range(len(id2emb_dict))]
    # Return only the embedding matrix and token2idx_dict
    return emb_mat, token2idx_dict
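
A toy run of get_embedding without a pretrained file (the counter below is my own example) makes the index layout explicit: 0 is --NULL--, 1 is --OOV--, and real tokens start at 2:

# Toy usage of get_embedding with random initialization (illustrative only).
from collections import Counter

toy_counter = Counter({"force": 5, "energy": 3, "entropy": 1})
emb_mat, token2idx = get_embedding(toy_counter, "word", limit=-1, vec_size=4)

print(token2idx["--NULL--"], token2idx["--OOV--"])  # 0 1
print(sorted(token2idx.values()))                   # [0, 1, 2, 3, 4]
print(len(emb_mat), len(emb_mat[0]))                # 5 4  (row 0 is the all-zero NULL vector)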


# Build TFRecord features for questions, paragraphs, answers, and so on
def build_features(config, examples, data_type, out_file, word2idx_dict, char2idx_dict, is_test=False):
    # Maximum paragraph length
    para_limit = config.test_para_limit if is_test else config.para_limit
    # Maximum question length
    ques_limit = config.test_ques_limit if is_test else config.ques_limit
    # Maximum number of characters per token
    char_limit = config.char_limit

    # Filter out examples whose paragraph or question exceeds the length limits
    def filter_func(example, is_test=False):
        return len(example["context_tokens"]) > para_limit or len(example["ques_tokens"]) > ques_limit

    print("Processing {} examples...".format(data_type))
    writer = tf.python_io.TFRecordWriter(out_file)
    total = 0
    total_ = 0
    meta = {}
    # Process each example
    for example in tqdm(examples):
        total_ += 1
        # Skip examples that exceed the length limits
        if filter_func(example, is_test):
            continue
        total += 1
        # Paragraph word ids
        context_idxs = np.zeros([para_limit], dtype=np.int32)
        # Paragraph character-id matrix
        context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
        # Question word ids
        ques_idxs = np.zeros([ques_limit], dtype=np.int32)
        # Question character-id matrix
        ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)
        # One-hot answer start and end positions over the paragraph
        y1 = np.zeros([para_limit], dtype=np.float32)
        y2 = np.zeros([para_limit], dtype=np.float32)

        # Look up a word id, trying several casings; return 1 (OOV) if not found
        def _get_word(word):
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in word2idx_dict:
                    return word2idx_dict[each]
            return 1

        # Look up a character id; return 1 (OOV) if not found
        def _get_char(char):
            if char in char2idx_dict:
                return char2idx_dict[char]
            return 1

        # Word ids for the tokenized paragraph (context_tokens is already tokenized)
        for i, token in enumerate(example["context_tokens"]):
            context_idxs[i] = _get_word(token)
        # Word ids for the tokenized question (ques_tokens is already tokenized)
        for i, token in enumerate(example["ques_tokens"]):
            ques_idxs[i] = _get_word(token)
        # Character ids for the paragraph
        for i, token in enumerate(example["context_chars"]):
            for j, char in enumerate(token):
                # Truncate tokens longer than char_limit
                if j == char_limit:
                    break
                # Positions beyond the token length stay 0 (padding)
                context_char_idxs[i, j] = _get_char(char)
        # Character ids for the question
        for i, token in enumerate(example["ques_chars"]):
            for j, char in enumerate(token):
                # Truncate tokens longer than char_limit
                if j == char_limit:
                    break
                # Positions beyond the token length stay 0 (padding)
                ques_char_idxs[i, j] = _get_char(char)
        # Start and end token positions of the (last) answer
        start, end = example["y1s"][-1], example["y2s"][-1]
        y1[start], y2[end] = 1.0, 1.0
        # Serialize the example as a TensorFlow record
        record = tf.train.Example(features=tf.train.Features(feature={
            "context_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_idxs.tostring()])),
            "ques_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_idxs.tostring()])),
            "context_char_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_char_idxs.tostring()])),
            "ques_char_idxs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_char_idxs.tostring()])),
            "y1": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1.tostring()])),
            "y2": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2.tostring()])),
            "id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example["id"]]))
        }))
        writer.write(record.SerializeToString())
    print("Build {} / {} instances of features in total".format(total, total_))
    meta["total"] = total
    writer.close()
    return meta
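
For reference, here is a minimal TF 1.x sketch of how one of these serialized records can be decoded back into tensors. It is not part of prepro.py (the original project does this in its own input pipeline); parse_record is a name I chose, and the limits passed in must match the values used at write time:

# Decoding sketch for the records written above (illustrative only).
import tensorflow as tf

def parse_record(serialized, para_limit, ques_limit, char_limit):
    features = tf.parse_single_example(serialized, features={
        "context_idxs": tf.FixedLenFeature([], tf.string),
        "ques_idxs": tf.FixedLenFeature([], tf.string),
        "context_char_idxs": tf.FixedLenFeature([], tf.string),
        "ques_char_idxs": tf.FixedLenFeature([], tf.string),
        "y1": tf.FixedLenFeature([], tf.string),
        "y2": tf.FixedLenFeature([], tf.string),
        "id": tf.FixedLenFeature([], tf.int64)})
    # Reverse the tostring() serialization and restore the original shapes
    context_idxs = tf.reshape(tf.decode_raw(features["context_idxs"], tf.int32), [para_limit])
    ques_idxs = tf.reshape(tf.decode_raw(features["ques_idxs"], tf.int32), [ques_limit])
    context_char_idxs = tf.reshape(
        tf.decode_raw(features["context_char_idxs"], tf.int32), [para_limit, char_limit])
    ques_char_idxs = tf.reshape(
        tf.decode_raw(features["ques_char_idxs"], tf.int32), [ques_limit, char_limit])
    y1 = tf.reshape(tf.decode_raw(features["y1"], tf.float32), [para_limit])
    y2 = tf.reshape(tf.decode_raw(features["y2"], tf.float32), [para_limit])
    return context_idxs, ques_idxs, context_char_idxs, ques_char_idxs, y1, y2, features["id"]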

# Save an object to a JSON file
def save(filename, obj, message=None):
    if message is not None:
        print("Saving {}...".format(message))
    with open(filename, "w") as fh:
        json.dump(obj, fh)

# Top-level preprocessing entry point
def prepro(config):
    # Word and character counters
    word_counter, char_counter = Counter(), Counter()
    # Process the training set
    train_examples, train_eval = process_file(
        config.train_file, "train", word_counter, char_counter)
    # Process the dev set
    dev_examples, dev_eval = process_file(
        config.dev_file, "dev", word_counter, char_counter)
    # Process the test set
    test_examples, test_eval = process_file(
        config.test_file, "test", word_counter, char_counter)
    # Word embedding file (fastText or GloVe)
    word_emb_file = config.fasttext_file if config.fasttext else config.glove_word_file
    # Character embedding file
    char_emb_file = config.glove_char_file if config.pretrained_char else None
    # Number of character vectors
    char_emb_size = config.glove_char_size if config.pretrained_char else None
    # Character embedding dimension
    char_emb_dim = config.glove_dim if config.pretrained_char else config.char_dim
    # word2idx dictionary
    word2idx_dict = None
    # Load the word2idx dictionary directly if it already exists
    if os.path.isfile(config.word2idx_file):
        with open(config.word2idx_file, "r") as fh:
            word2idx_dict = json.load(fh)
    # Build the word embedding matrix
    word_emb_mat, word2idx_dict = get_embedding(word_counter, "word", emb_file=word_emb_file,
                                                size=config.glove_word_size, vec_size=config.glove_dim, token2idx_dict=word2idx_dict)

    # char2idx dictionary
    char2idx_dict = None
    # Load the char2idx dictionary directly if it already exists
    if os.path.isfile(config.char2idx_file):
        with open(config.char2idx_file, "r") as fh:
            char2idx_dict = json.load(fh)
    # Build the character embedding matrix
    char_emb_mat, char2idx_dict = get_embedding(
        char_counter, "char", emb_file=char_emb_file, size=char_emb_size, vec_size=char_emb_dim,
        token2idx_dict=char2idx_dict)

    # Build features for the training, dev, and test sets
    build_features(config, train_examples, "train",
                   config.train_record_file, word2idx_dict, char2idx_dict)
    dev_meta = build_features(config, dev_examples, "dev",
                              config.dev_record_file, word2idx_dict, char2idx_dict)
    test_meta = build_features(config, test_examples, "test",
                               config.test_record_file, word2idx_dict, char2idx_dict, is_test=True)

    # Save all preprocessed artifacts
    save(config.word_emb_file, word_emb_mat, message="word embedding")
    save(config.char_emb_file, char_emb_mat, message="char embedding")
    save(config.train_eval_file, train_eval, message="train eval")
    save(config.dev_eval_file, dev_eval, message="dev eval")
    save(config.test_eval_file, test_eval, message="test eval")
    save(config.dev_meta, dev_meta, message="dev meta")
    save(config.word2idx_file, word2idx_dict, message="word2idx")
    save(config.char2idx_file, char2idx_dict, message="char2idx")
    save(config.test_meta, test_meta, message="test meta")
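
To make the dependencies of prepro() explicit, here is a hypothetical stand-in for the config object, listing exactly the attributes the code above reads. All paths and limits are placeholder values I chose; the real project defines these in its own config file:

# Hypothetical config stand-in for running prepro() (placeholder values).
from types import SimpleNamespace

config = SimpleNamespace(
    train_file="data/squad/train-v1.1.json",
    dev_file="data/squad/dev-v1.1.json",
    test_file="data/squad/dev-v1.1.json",
    fasttext=False, fasttext_file=None,
    glove_word_file="data/glove/glove.840B.300d.txt",
    glove_word_size=int(2.2e6), glove_dim=300,
    pretrained_char=False, glove_char_file=None, glove_char_size=94, char_dim=8,
    para_limit=400, ques_limit=50, test_para_limit=1000, test_ques_limit=100,
    char_limit=16,
    word2idx_file="data/word2idx.json", char2idx_file="data/char2idx.json",
    word_emb_file="data/word_emb.json", char_emb_file="data/char_emb.json",
    train_record_file="data/train.tfrecords", dev_record_file="data/dev.tfrecords",
    test_record_file="data/test.tfrecords",
    train_eval_file="data/train_eval.json", dev_eval_file="data/dev_eval.json",
    test_eval_file="data/test_eval.json",
    dev_meta="data/dev_meta.json", test_meta="data/test_meta.json",
)
prepro(config)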
Author: CinKate
Link: http://renxingkai.github.io/2019/03/21/bidaf/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless otherwise stated.