6 changes: 6 additions & 0 deletions knlp/Pipeline/PipeEval.py
@@ -0,0 +1,6 @@
class PipeEvaluator:
def __init__(self):
pass

def evaluate(self):
pass
6 changes: 6 additions & 0 deletions knlp/Pipeline/PipeTrainer.py
@@ -0,0 +1,6 @@
class PipeTrainer:
def __init__(self):
pass

def train(self):
pass
Empty file added knlp/Pipeline/__init__.py
12 changes: 12 additions & 0 deletions knlp/Pipeline/pipeline.py
@@ -0,0 +1,12 @@
class Pipeline:
def __init__(self):
pass

def train(self, model):
pass

def inference(self, model, input):
pass

def evaluate(self, model):
pass
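The three classes above are still empty stubs. The sketch below is only one assumption of how Pipeline might eventually delegate to PipeTrainer and PipeEvaluator; none of this wiring is part of the PR.

# Hypothetical wiring, not part of this PR: Pipeline delegating to the stub classes.
from knlp.Pipeline.PipeTrainer import PipeTrainer
from knlp.Pipeline.PipeEval import PipeEvaluator


class Pipeline:
    def __init__(self):
        # One trainer and one evaluator per pipeline instance.
        self.trainer = PipeTrainer()
        self.evaluator = PipeEvaluator()

    def train(self, model):
        # Delegate training of the chosen model to the trainer stub.
        return self.trainer.train()

    def inference(self, model, input_text):
        # Placeholder until per-model predict logic is implemented.
        raise NotImplementedError

    def evaluate(self, model):
        # Delegate evaluation to the evaluator stub.
        return self.evaluator.evaluate()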
3 changes: 2 additions & 1 deletion knlp/common/constant.py
@@ -19,4 +19,5 @@
SEP = "[SEP]"
CLS = "[CLS]"
MASK = "MASK"
model_list = ['hmm', 'crf', 'trie', 'bilstm', 'bert_mrc', 'bert_tagger']  # models currently supported by the NER pipeline
model_list = ['hmm', 'crf', 'trie', 'bilstm', 'bert_mrc', 'bert_tagger']  # models currently supported by the NER pipeline
class_model_list = ['bert', 'textcnn', 'beyas']  # models currently supported by the classification pipeline
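A minimal sketch of how these lists could gate model selection; the helper below is hypothetical and not part of the PR.

from knlp.common.constant import model_list, class_model_list


def check_model_name(name, task="ner"):
    # Hypothetical helper: reject model names not listed for the given pipeline.
    supported = model_list if task == "ner" else class_model_list
    if name not in supported:
        raise ValueError(f"'{name}' is not supported for task '{task}'; expected one of {supported}")
    return name


# Example: check_model_name('bert', task='classification') returns 'bert',
# while check_model_name('svm', task='classification') raises ValueError.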
17 changes: 8 additions & 9 deletions knlp/seq_labeling/NER/bert/trainer.py
@@ -298,9 +298,9 @@ def evaluate(self, args, model, tokenizer, prefix=""):
return results

def load_and_cache_examples(self, args, task, tokenizer, data_type='train'):
# if args.local_rank not in [-1, 0] and not args.do_eval:
# torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# processor = processors[task]()
# if args.local_rank not in [-1, 0] and not args.do_eval:
#     torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
# processor = processors[task]()
processor = processors(self.task)
if self.training_data_path:
args.data_dir = self.training_data_path
@@ -378,15 +378,14 @@ def run(self):
args.label2id = {label: i for i, label in enumerate(label_list)}
num_labels = len(label_list)

# # Load pretrained model and tokenizer
# if args.local_rank not in [-1, 0]:
# torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
# # Load pretrained model and tokenizer
# if args.local_rank not in [-1, 0]:
#     torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
args.model_type = args.model_type.lower()
config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels)
config.loss_type = args.loss_type
model = BertSoftmaxForNer.from_pretrained(args.model_name_or_path, config=config)
# if args.local_rank == 0:
# torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
# if args.local_rank == 0:
#     torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
@@ -400,7 +399,7 @@ def run(self):
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
logger.info("Saving model checkpoint to %s", self.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
model.module if hasattr(model, "module") else model
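For reference, the save/reload round trip the comment above describes follows the usual transformers pattern; a minimal sketch, where the function and its arguments are illustrative rather than code from this repo.

from transformers import BertConfig, BertTokenizer


def save_and_reload(model, tokenizer, output_dir, model_class):
    # Unwrap DataParallel/DistributedDataParallel before saving, as run() does above.
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    # The checkpoint directory can later be reloaded with from_pretrained().
    config = BertConfig.from_pretrained(output_dir)
    reloaded_model = model_class.from_pretrained(output_dir, config=config)
    reloaded_tokenizer = BertTokenizer.from_pretrained(output_dir)
    return reloaded_model, reloaded_tokenizer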
34 changes: 34 additions & 0 deletions knlp/seq_labeling/bert/metrics/classification_metrics.py
@@ -0,0 +1,34 @@
import logging

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score

logger = logging.getLogger(__name__)


def simple_accuracy(preds, labels):
return (preds == labels).mean()


def acc_and_f1(preds, labels):
acc = simple_accuracy(preds, labels)
f1 = f1_score(y_true=labels, y_pred=preds)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}


def pearson_and_spearman(preds, labels):
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}


def compute_metrics(task_name, preds, labels):
assert len(preds) == len(labels)
return {"acc": simple_accuracy(preds, labels)}
2 changes: 1 addition & 1 deletion knlp/seq_labeling/bert/metrics/ner_metrics.py
@@ -3,7 +3,7 @@
from knlp.seq_labeling.bert.processors.utils_ner import get_entities

class SeqEntityScore(object):
def __init__(self, id2label,markup='bios'):
def __init__(self, id2label, markup='bios'):
self.id2label = id2label
self.markup = markup
self.reset()
50 changes: 50 additions & 0 deletions knlp/seq_labeling/bert/models/bert_for_classification.py
@@ -0,0 +1,50 @@
import torch
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel
from torch.nn import CrossEntropyLoss

from knlp.common.constant import KNLP_PATH
from knlp.seq_labeling.bert.losses.focal_loss import FocalLoss
from knlp.seq_labeling.bert.losses.label_smoothing import LabelSmoothingCrossEntropy


class BertForTokenClassification(BertPreTrainedModel):

def __init__(self, config):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels

self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

self.init_weights()

def forward(self, input_ids, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, labels=None):

outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)

sequence_output = outputs[0]

sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)

outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs

return outputs # (loss), scores, (hidden_states), (attentions)
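A minimal forward-pass sketch for this head; the checkpoint name and label count are placeholders, not values used in the repo.

import torch
from transformers import BertTokenizer

from knlp.seq_labeling.bert.models.bert_for_classification import BertForTokenClassification

# Hypothetical setup: any BERT checkpoint plus a small tag set.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForTokenClassification.from_pretrained("bert-base-chinese", num_labels=4)
model.eval()

encoded = tokenizer("knlp is a Chinese NLP toolkit", return_tensors="pt")
labels = torch.zeros_like(encoded["input_ids"])  # dummy per-token labels

with torch.no_grad():
    loss, logits = model(input_ids=encoded["input_ids"],
                         attention_mask=encoded["attention_mask"],
                         token_type_ids=encoded["token_type_ids"],
                         labels=labels)[:2]

print(loss.item(), logits.shape)  # scalar loss and (batch, seq_len, num_labels) scores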