6 changes: 6 additions & 0 deletions knlp/Pipeline/PipeEval.py
@@ -0,0 +1,6 @@
class PipeEvaluator:
def __init__(self):
pass

def evaluate(self):
pass
6 changes: 6 additions & 0 deletions knlp/Pipeline/PipeTrainer.py
@@ -0,0 +1,6 @@
class PipeTrainer:
def __init__(self):
pass

def train(self):
pass
Empty file added knlp/Pipeline/__init__.py
12 changes: 12 additions & 0 deletions knlp/Pipeline/pipeline.py
@@ -0,0 +1,12 @@
class Pipeline:
def __init__(self):
pass

def train(self, model):
pass

def inference(self, model, input):
pass

def evaluate(self, model):
pass
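The three classes above are still empty stubs. The sketch below is only one assumption of how Pipeline might eventually delegate to PipeTrainer and PipeEvaluator; none of this wiring is part of the PR.

# Hypothetical wiring, not part of this PR: Pipeline delegating to the stub classes.
from knlp.Pipeline.PipeTrainer import PipeTrainer
from knlp.Pipeline.PipeEval import PipeEvaluator


class Pipeline:
    def __init__(self):
        # One trainer and one evaluator per pipeline instance.
        self.trainer = PipeTrainer()
        self.evaluator = PipeEvaluator()

    def train(self, model):
        # Delegate training of the chosen model to the trainer stub.
        return self.trainer.train()

    def inference(self, model, input_text):
        # Placeholder until per-model predict logic is implemented.
        raise NotImplementedError

    def evaluate(self, model):
        # Delegate evaluation to the evaluator stub.
        return self.evaluator.evaluate()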
3 changes: 2 additions & 1 deletion knlp/common/constant.py
@@ -19,4 +19,5 @@
SEP = "[SEP]"
CLS = "[CLS]"
MASK = "MASK"
model_list = ['hmm', 'crf', 'trie', 'bilstm', 'bert_mrc', 'bert_tagger']  # models currently supported by the NER pipeline
model_list = ['hmm', 'crf', 'trie', 'bilstm', 'bert_mrc', 'bert_tagger']  # models currently supported by the NER pipeline
class_model_list = ['bert', 'textcnn', 'beyas']  # models currently supported by the classification pipeline
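A minimal sketch of how these lists could gate model selection; the helper below is hypothetical and not part of the PR.

from knlp.common.constant import model_list, class_model_list


def check_model_name(name, task="ner"):
    # Hypothetical helper: reject model names not listed for the given pipeline.
    supported = model_list if task == "ner" else class_model_list
    if name not in supported:
        raise ValueError(f"'{name}' is not supported for task '{task}'; expected one of {supported}")
    return name


# Example: check_model_name('bert', task='classification') returns 'bert',
# while check_model_name('svm', task='classification') raises ValueError.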
17 changes: 8 additions & 9 deletions knlp/seq_labeling/NER/bert/trainer.py
@@ -298,9 +298,9 @@ def evaluate(self, args, model, tokenizer, prefix=""):
return results

def load_and_cache_examples(self, args, task, tokenizer, data_type='train'):
# if args.local_rank not in [-1, 0] and not args.do_eval:
# torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# processor = processors[task]()
# if args.local_rank not in [-1, 0] and not args.do_eval:
#     torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
# processor = processors[task]()
processor = processors(self.task)
if self.training_data_path:
args.data_dir = self.training_data_path
@@ -378,15 +378,14 @@ def run(self):
args.label2id = {label: i for i, label in enumerate(label_list)}
num_labels = len(label_list)

# # Load pretrained model and tokenizer
# if args.local_rank not in [-1, 0]:
# torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
# # Load pretrained model and tokenizer
# if args.local_rank not in [-1, 0]:
#     torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
args.model_type = args.model_type.lower()
config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels)
config.loss_type = args.loss_type
model = BertSoftmaxForNer.from_pretrained(args.model_name_or_path, config=config)
# if args.local_rank == 0:
# torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
# if args.local_rank == 0:
#     torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
@@ -400,7 +399,7 @@ def run(self):
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
logger.info("Saving model checkpoint to %s", self.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
model.module if hasattr(model, "module") else model
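For reference, the save/reload round trip the comment above describes follows the usual transformers pattern; a minimal sketch, where the function and its arguments are illustrative rather than code from this repo.

from transformers import BertConfig, BertTokenizer


def save_and_reload(model, tokenizer, output_dir, model_class):
    # Unwrap DataParallel/DistributedDataParallel before saving, as run() does above.
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    # The checkpoint directory can later be reloaded with from_pretrained().
    config = BertConfig.from_pretrained(output_dir)
    reloaded_model = model_class.from_pretrained(output_dir, config=config)
    reloaded_tokenizer = BertTokenizer.from_pretrained(output_dir)
    return reloaded_model, reloaded_tokenizer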
34 changes: 34 additions & 0 deletions knlp/seq_labeling/bert/metrics/classification_metrics.py
@@ -0,0 +1,34 @@
import logging

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score

logger = logging.getLogger(__name__)


def simple_accuracy(preds, labels):
return (preds == labels).mean()


def acc_and_f1(preds, labels):
acc = simple_accuracy(preds, labels)
f1 = f1_score(y_true=labels, y_pred=preds)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}


def pearson_and_spearman(preds, labels):
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}


def compute_metrics(task_name, preds, labels):
assert len(preds) == len(labels)
return {"acc": simple_accuracy(preds, labels)}
2 changes: 1 addition & 1 deletion knlp/seq_labeling/bert/metrics/ner_metrics.py
@@ -3,7 +3,7 @@
from knlp.seq_labeling.bert.processors.utils_ner import get_entities

class SeqEntityScore(object):
def __init__(self, id2label,markup='bios'):
def __init__(self, id2label, markup='bios'):
self.id2label = id2label
self.markup = markup
self.reset()
50 changes: 50 additions & 0 deletions knlp/seq_labeling/bert/models/bert_for_classification.py
@@ -0,0 +1,50 @@
import torch
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel
from torch.nn import CrossEntropyLoss

from knlp.common.constant import KNLP_PATH
from knlp.seq_labeling.bert.losses.focal_loss import FocalLoss
from knlp.seq_labeling.bert.losses.label_smoothing import LabelSmoothingCrossEntropy


class BertForTokenClassification(BertPreTrainedModel):

def __init__(self, config):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels

self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

self.init_weights()

def forward(self, input_ids, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, labels=None):

outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)

sequence_output = outputs[0]

sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)

outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs

return outputs # (loss), scores, (hidden_states), (attentions)
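A minimal forward-pass sketch for this head; the checkpoint name and label count are placeholders, not values used in the repo.

import torch
from transformers import BertTokenizer

from knlp.seq_labeling.bert.models.bert_for_classification import BertForTokenClassification

# Hypothetical setup: any BERT checkpoint plus a small tag set.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForTokenClassification.from_pretrained("bert-base-chinese", num_labels=4)
model.eval()

encoded = tokenizer("knlp is a Chinese NLP toolkit", return_tensors="pt")
labels = torch.zeros_like(encoded["input_ids"])  # dummy per-token labels

with torch.no_grad():
    loss, logits = model(input_ids=encoded["input_ids"],
                         attention_mask=encoded["attention_mask"],
                         token_type_ids=encoded["token_type_ids"],
                         labels=labels)[:2]

print(loss.item(), logits.shape)  # scalar loss and (batch, seq_len, num_labels) scores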