Populating the interactive namespace from numpy and matplotlib

count     100.000000
mean      872.320000
std       923.138191
min        64.000000
25%       359.500000
50%       598.000000
75%      1058.000000
max      7125.000000
Name: text_len, dtype: float64
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
vectorizer.fit_transform(corpus).toarray()
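For reference, the learned vocabulary can be inspected with the standard scikit-learn attribute; column indices in the count matrix are assigned in alphabetical token order:

# Token -> column index in the count matrix above.
print(sorted(vectorizer.vocabulary_.items(), key=lambda kv: kv[1]))
# [('and', 0), ('document', 1), ('first', 2), ('is', 3), ('one', 4), ('second', 5), ('the', 6), ('third', 7), ('this', 8)]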
    batch_size = int(total / fold_num)
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    # Walk over all_index (the indices already grouped into fold_num folds) fold by fold.
    for fold in range(fold_num):
        num = len(all_index[fold])
        texts = [all_texts[i] for i in all_index[fold]]
        labels = [all_labels[i] for i in all_index[fold]]

        # If a fold holds more than the average batch_size, truncate it to batch_size and
        # move the overflow into other_texts; fold_labels is handled the same way.
        if num > batch_size:
            fold_texts = texts[:batch_size]
            other_texts.extend(texts[batch_size:])
            fold_labels = labels[:batch_size]
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
        # If a fold holds fewer than batch_size texts, top it up with batch_size - num
        # texts taken from other_texts; fold_labels is handled the same way.
        elif num < batch_size:
            end = start + batch_size - num
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            start = end
        # Otherwise keep the texts and labels as they are.
        else:
            fold_texts = texts
            fold_labels = labels

        assert batch_size == len(fold_labels)

        # Shuffle the texts and labels within each fold.
        index = list(range(batch_size))
        np.random.shuffle(index)

        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])

        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        fold_data.append(data)

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data
fold_data = all_data2fold(10)
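A quick sanity check (not part of the original code) confirms that each fold ends up with roughly total / 10 samples and a similar label mix:

from collections import Counter

# Illustrative check: fold sizes should all be close to total // 10,
# and the most common labels should appear in similar proportions in every fold.
for k, fold in enumerate(fold_data):
    print(k, len(fold['label']), Counter(fold['label']).most_common(3))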
# build train data for word2vec
fold_id = 9

train_texts = []
for i in range(0, fold_id):
    data = fold_data[i]
    train_texts.extend(data['text'])

logging.info('Total %d docs.' % len(train_texts))
logging.info('Start training...')
from gensim.models.word2vec import Word2Vec

num_features = 100   # Word vector dimensionality
num_workers = 8      # Number of threads to run in parallel
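A minimal sketch of what the training call could look like with gensim (parameter names assume gensim >= 4.0, where `size` was renamed to `vector_size`; `min_count` and the save path are illustrative, not taken from the original write-up):

# Each document is a whitespace-separated string of word ids; split it into a token list.
train_tokens = [text.split() for text in train_texts]

# Train embeddings of dimension num_features, using num_workers threads.
model = Word2Vec(train_tokens, vector_size=num_features, workers=num_workers, min_count=2)

# Persist the vectors in the plain-text word2vec format so they can be reloaded later.
model.wv.save_word2vec_format('./word2vec.txt', binary=False)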
    index = list(range(total))
    np.random.shuffle(index)

    all_texts = []
    all_labels = []
    for i in index:
        all_texts.append(texts[i])
        all_labels.append(labels[i])

    label2id = {}
    for i in range(total):
        label = str(all_labels[i])
        if label not in label2id:
            label2id[label] = [i]
        else:
            label2id[label].append(i)

    all_index = [[] for _ in range(fold_num)]
    for label, data in label2id.items():
        # print(label, len(data))
        batch_size = int(len(data) / fold_num)
        other = len(data) - batch_size * fold_num
        for i in range(fold_num):
            cur_batch_size = batch_size + 1 if i < other else batch_size
            # print(cur_batch_size)
            batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
            all_index[i].extend(batch_data)

    batch_size = int(total / fold_num)
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    for fold in range(fold_num):
        num = len(all_index[fold])
        texts = [all_texts[i] for i in all_index[fold]]
        labels = [all_labels[i] for i in all_index[fold]]

        if num > batch_size:
            fold_texts = texts[:batch_size]
            other_texts.extend(texts[batch_size:])
            fold_labels = labels[:batch_size]
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
        elif num < batch_size:
            end = start + batch_size - num
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            start = end
        else:
            fold_texts = texts
            fold_labels = labels

        assert batch_size == len(fold_labels)

        # shuffle
        index = list(range(batch_size))
        np.random.shuffle(index)

        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])

        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        fold_data.append(data)

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))
logging.info('Build model with lstm word encoder, lstm sent encoder.')
para_num = sum([np.prod(list(p.size())) for p in self.parameters()])
logging.info('Model param num: %.2f M.' % (para_num / 1e6))
def forward(self, batch_inputs):
    # batch_inputs (batch_inputs1, batch_inputs2): b x doc_len x sent_len
    # batch_masks: b x doc_len x sent_len
    batch_inputs1, batch_inputs2, batch_masks = batch_inputs
    batch_size, max_doc_len, max_sent_len = batch_inputs1.shape[0], batch_inputs1.shape[1], batch_inputs1.shape[2]
    batch_inputs1 = batch_inputs1.view(batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len
    batch_inputs2 = batch_inputs2.view(batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len
    batch_masks = batch_masks.view(batch_size * max_doc_len, max_sent_len)      # sen_num x sent_len

    batch_hiddens = self.word_encoder(batch_inputs1, batch_inputs2, batch_masks)  # sen_num x sent_len x sent_rep_size
    sent_reps, atten_scores = self.word_attention(batch_hiddens, batch_masks)     # sen_num x sent_rep_size

    sent_reps = sent_reps.view(batch_size, max_doc_len, self.sent_rep_size)   # b x doc_len x sent_rep_size
    batch_masks = batch_masks.view(batch_size, max_doc_len, max_sent_len)     # b x doc_len x max_sent_len
    sent_masks = batch_masks.bool().any(2).float()                            # b x doc_len

    sent_hiddens = self.sent_encoder(sent_reps, sent_masks)                   # b x doc_len x doc_rep_size
    doc_reps, atten_scores = self.sent_attention(sent_hiddens, sent_masks)    # b x doc_rep_size

    batch_outputs = self.out(doc_reps)  # b x num_labels

    return batch_outputs
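One detail worth noting is the sentence mask: a sentence counts as real if any of its token positions is unmasked. A tiny standalone illustration (shapes and values are made up):

import torch

# Two documents, up to 2 sentences each, 3 token positions per sentence.
batch_masks = torch.tensor([[[1., 1., 0.],    # doc 0, sent 0: two real tokens
                             [0., 0., 0.]],   # doc 0, sent 1: padding only
                            [[1., 0., 0.],    # doc 1, sent 0: one real token
                             [1., 1., 1.]]])  # doc 1, sent 1: three real tokens
sent_masks = batch_masks.bool().any(2).float()
print(sent_masks)  # tensor([[1., 0.], [1., 1.]])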
for name, parameters in model_parameters.items():
    if name.startswith("basic"):
        optim = torch.optim.Adam(parameters, lr=learning_rate)
        self.optims.append(optim)
index = list(range(0, document_len, max_sent_len))
index.append(document_len)

segments = []
for i in range(len(index) - 1):
    segment = words[index[i]: index[i + 1]]
    assert len(segment) > 0
    segment = [word if word in vocab._id2word else '<UNK>' for word in segment]
    segments.append([len(segment), segment])
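As a standalone toy run of the cut-point logic above (values are illustrative): with a 7-word document and max_sent_len = 3, the cut points become [0, 3, 6, 7], so the document is split into segments of 3, 3 and 1 words.

document_len, max_sent_len = 7, 3   # toy values
index = list(range(0, document_len, max_sent_len))
index.append(document_len)
print(index)                                                      # [0, 3, 6, 7]
print([index[i + 1] - index[i] for i in range(len(index) - 1)])   # [3, 3, 1]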
def batch_slice(data, batch_size):
    batch_num = int(np.ceil(len(data) / float(batch_size)))
    for i in range(batch_num):
        cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
        docs = [data[i * batch_size + b] for b in range(cur_batch_size)]

        yield docs
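batch_slice simply yields consecutive chunks of batch_size documents, with a possibly smaller final chunk; for example:

# 10 dummy docs with batch_size 4 -> chunks of 4, 4 and 2 documents.
print([len(batch) for batch in batch_slice(list(range(10)), 4)])  # [4, 4, 2]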
def data_iter(data, batch_size, shuffle=True, noise=1.0):
    """
    Randomly permute data, then sort by source length and partition into batches,
    so that the sentences in each batch have similar lengths.
    """
    batched_data = []
    if shuffle:
        np.random.shuffle(data)

        lengths = [example[1] for example in data]
        noisy_lengths = [- (l + np.random.uniform(- noise, noise)) for l in lengths]
        sorted_indices = np.argsort(noisy_lengths).tolist()
        sorted_data = [data[i] for i in sorted_indices]
    else:
        sorted_data = data
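The noisy sort is what keeps batches length-homogeneous without making them identical every epoch: a small uniform perturbation added before argsort preserves the coarse ordering by length while letting documents of near-equal length swap places between runs. A toy demonstration with plain numpy:

import numpy as np

lengths = [5, 50, 52, 51, 6, 300]
noisy = [-(l + np.random.uniform(-1.0, 1.0)) for l in lengths]
print(np.argsort(noisy).tolist())
# longest first, e.g. [5, 2, 3, 1, 4, 0]; near-equal lengths (50/51/52, 5/6) may swap between runs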
if self.best_dev_f1 <= dev_f1:
    logging.info("Exceed history dev = %.2f, current dev = %.2f" % (self.best_dev_f1, dev_f1))
    torch.save(self.model.state_dict(), save_model)
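To reuse the checkpoint saved above, the standard PyTorch pattern applies (assuming model is an instance built with the same architecture as self.model):

# Restore the best weights and switch off dropout for evaluation.
model.load_state_dict(torch.load(save_model))
model.eval()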
    index = list(range(total))
    np.random.shuffle(index)

    all_texts = []
    all_labels = []
    for i in index:
        all_texts.append(texts[i])
        all_labels.append(labels[i])

    label2id = {}
    for i in range(total):
        label = str(all_labels[i])
        if label not in label2id:
            label2id[label] = [i]
        else:
            label2id[label].append(i)

    all_index = [[] for _ in range(fold_num)]
    for label, data in label2id.items():
        # print(label, len(data))
        batch_size = int(len(data) / fold_num)
        other = len(data) - batch_size * fold_num
        for i in range(fold_num):
            cur_batch_size = batch_size + 1 if i < other else batch_size
            # print(cur_batch_size)
            batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
            all_index[i].extend(batch_data)

    batch_size = int(total / fold_num)
    other_texts = []
    other_labels = []
    other_num = 0
    start = 0
    for fold in range(fold_num):
        num = len(all_index[fold])
        texts = [all_texts[i] for i in all_index[fold]]
        labels = [all_labels[i] for i in all_index[fold]]

        if num > batch_size:
            fold_texts = texts[:batch_size]
            other_texts.extend(texts[batch_size:])
            fold_labels = labels[:batch_size]
            other_labels.extend(labels[batch_size:])
            other_num += num - batch_size
        elif num < batch_size:
            end = start + batch_size - num
            fold_texts = texts + other_texts[start: end]
            fold_labels = labels + other_labels[start: end]
            start = end
        else:
            fold_texts = texts
            fold_labels = labels

        assert batch_size == len(fold_labels)

        # shuffle
        index = list(range(batch_size))
        np.random.shuffle(index)

        shuffle_fold_texts = []
        shuffle_fold_labels = []
        for i in index:
            shuffle_fold_texts.append(fold_texts[i])
            shuffle_fold_labels.append(fold_labels[i])

        data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
        fold_data.append(data)

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data
fold_data = all_data2fold(10)
# build train, dev, test data
fold_id = 9

# dev
dev_data = fold_data[fold_id]

# train
train_texts = []
train_labels = []
for i in range(0, fold_id):
    data = fold_data[i]
    train_texts.extend(data['text'])
    train_labels.extend(data['label'])
def get_bert_parameters(self):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in self.bert.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in self.bert.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    return optimizer_parameters
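These two groups are the usual way to disable weight decay on biases and LayerNorm weights; the returned list plugs straight into an optimizer as parameter groups. A self-contained sketch of the same pattern (the toy module and the 2e-5 learning rate are illustrative, not from the original code):

import torch
import torch.nn as nn
from collections import OrderedDict

# Toy stand-in for self.bert, with a submodule literally named 'LayerNorm'
# so the name patterns behave as they would on a real BERT model.
bert_like = nn.Sequential(OrderedDict([
    ('dense', nn.Linear(8, 8)),
    ('LayerNorm', nn.LayerNorm(8)),
]))
no_decay = ['bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in bert_like.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in bert_like.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optim = torch.optim.AdamW(grouped, lr=2e-5)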
def forward(self, input_ids, token_type_ids):
    # input_ids: sen_num x bert_len
    # token_type_ids: sen_num x bert_len

    # sen_num x bert_len x 256, sen_num x 256
    sequence_output, pooled_output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids)

    if self.pooled:
        reps = pooled_output
    else:
        reps = sequence_output[:, 0, :]  # sen_num x 256

    if self.training:
        reps = self.dropout(reps)

    return reps
class WhitespaceTokenizer():
    """WhitespaceTokenizer with vocab."""

    def __init__(self):
        vocab_file = bert_path + 'vocab.txt'
        self._token2id = self.load_vocab(vocab_file)
        self._id2token = {v: k for k, v in self._token2id.items()}
        self.max_len = 256
        self.unk = 1

        logging.info("Build Bert vocab with size %d." % (self.vocab_size))
logging.info('Build model with bert word encoder, lstm sent encoder.')
para_num = sum([np.prod(list(p.size())) for p in self.parameters()])
logging.info('Model param num: %.2f M.' % (para_num / 1e6))
def forward(self, batch_inputs):
    # batch_inputs (batch_inputs1, batch_inputs2): b x doc_len x sent_len
    # batch_masks: b x doc_len x sent_len
    batch_inputs1, batch_inputs2, batch_masks = batch_inputs
    batch_size, max_doc_len, max_sent_len = batch_inputs1.shape[0], batch_inputs1.shape[1], batch_inputs1.shape[2]
    batch_inputs1 = batch_inputs1.view(batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len
    batch_inputs2 = batch_inputs2.view(batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len
    batch_masks = batch_masks.view(batch_size * max_doc_len, max_sent_len)      # sen_num x sent_len

    sent_reps = self.word_encoder(batch_inputs1, batch_inputs2)  # sen_num x sent_rep_size

    sent_reps = sent_reps.view(batch_size, max_doc_len, self.sent_rep_size)   # b x doc_len x sent_rep_size
    batch_masks = batch_masks.view(batch_size, max_doc_len, max_sent_len)     # b x doc_len x max_sent_len
    sent_masks = batch_masks.bool().any(2).float()                            # b x doc_len

    sent_hiddens = self.sent_encoder(sent_reps, sent_masks)                   # b x doc_len x doc_rep_size
    doc_reps, atten_scores = self.sent_attention(sent_hiddens, sent_masks)    # b x doc_rep_size

    batch_outputs = self.out(doc_reps)  # b x num_labels

    return batch_outputs
for name, parameters in model_parameters.items():
    if name.startswith("basic"):
        optim = torch.optim.Adam(parameters, lr=learning_rate)
        self.optims.append(optim)
index = list(range(0, document_len, max_sent_len))
index.append(document_len)

segments = []
for i in range(len(index) - 1):
    segment = words[index[i]: index[i + 1]]
    assert len(segment) > 0
    segment = [word if word in vocab._id2word else '<UNK>' for word in segment]
    segments.append([len(segment), segment])
def batch_slice(data, batch_size):
    batch_num = int(np.ceil(len(data) / float(batch_size)))
    for i in range(batch_num):
        cur_batch_size = batch_size if i < batch_num - 1 else len(data) - batch_size * i
        docs = [data[i * batch_size + b] for b in range(cur_batch_size)]

        yield docs
def data_iter(data, batch_size, shuffle=True, noise=1.0):
    """
    Randomly permute data, then sort by source length and partition into batches,
    so that the sentences in each batch have similar lengths.
    """
    batched_data = []
    if shuffle:
        np.random.shuffle(data)

        lengths = [example[1] for example in data]
        noisy_lengths = [- (l + np.random.uniform(- noise, noise)) for l in lengths]
        sorted_indices = np.argsort(noisy_lengths).tolist()
        sorted_data = [data[i] for i in sorted_indices]
    else:
        sorted_data = data
if self.best_dev_f1 <= dev_f1:
    logging.info("Exceed history dev = %.2f, current dev = %.2f" % (self.best_dev_f1, dev_f1))
    torch.save(self.model.state_dict(), save_model)