diff --git a/knlp/Pipeline/PipeEval.py b/knlp/Pipeline/PipeEval.py
new file mode 100644
index 0000000..d7ccdb2
--- /dev/null
+++ b/knlp/Pipeline/PipeEval.py
@@ -0,0 +1,6 @@
+class PipeEvaluator:
+    def __init__(self):
+        pass
+
+    def evaluate(self):
+        pass
diff --git a/knlp/Pipeline/PipeTrainer.py b/knlp/Pipeline/PipeTrainer.py
new file mode 100644
index 0000000..dbf5d8a
--- /dev/null
+++ b/knlp/Pipeline/PipeTrainer.py
@@ -0,0 +1,6 @@
+class PipeTrainer:
+    def __init__(self):
+        pass
+
+    def train(self):
+        pass
\ No newline at end of file
diff --git a/knlp/Pipeline/__init__.py b/knlp/Pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/knlp/Pipeline/pipeline.py b/knlp/Pipeline/pipeline.py
new file mode 100644
index 0000000..257ef28
--- /dev/null
+++ b/knlp/Pipeline/pipeline.py
@@ -0,0 +1,12 @@
+class Pipeline:
+    def __init__(self):
+        pass
+
+    def train(self, model):
+        pass
+
+    def inference(self, model, input):
+        pass
+
+    def evaluate(self, model):
+        pass
diff --git a/knlp/common/constant.py b/knlp/common/constant.py
index 4669266..c236793 100644
--- a/knlp/common/constant.py
+++ b/knlp/common/constant.py
@@ -19,4 +19,5 @@
 SEP = "[SEP]"
 CLS = "[CLS]"
 MASK = "MASK"
-model_list = ['hmm', 'crf', 'trie', 'bilstm', 'bert_mrc', 'bert_tagger']  # ner pipeline中目前支持的模型列表
+model_list = ['hmm', 'crf', 'trie', 'bilstm', 'bert_mrc', 'bert_tagger']  # models currently supported by the NER pipeline
+class_model_list = ['bert', 'textcnn', 'beyas']  # models currently supported by the classification pipeline
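Editor's note: the Pipeline, PipeTrainer and PipeEvaluator classes above are still empty stubs. A minimal sketch of how the facade could dispatch on model_list once concrete trainers exist — the trainers dict and its contents are illustrative assumptions, not part of this patch:

from knlp.common.constant import model_list

class PipelineSketch:
    def __init__(self, trainers):
        self.trainers = trainers  # caller-supplied map: model name -> trainer object

    def train(self, model):
        if model not in model_list:
            raise ValueError(f"unsupported model {model!r}; expected one of {model_list}")
        return self.trainers[model].train()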
logger.info("Training/evaluation parameters %s", args) @@ -400,7 +399,7 @@ def run(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) logger.info("Saving model checkpoint to %s", self.output_dir) - # Save a trained model, configuration and tokenizer using `save_pretrained()`. + # Save a trained model, configuration and tokenizer using `saseve_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = ( model.module if hasattr(model, "module") else model diff --git a/knlp/seq_labeling/bert/metrics/classification_metrics.py b/knlp/seq_labeling/bert/metrics/classification_metrics.py new file mode 100644 index 0000000..f3f8409 --- /dev/null +++ b/knlp/seq_labeling/bert/metrics/classification_metrics.py @@ -0,0 +1,34 @@ +import csv +import sys +import logging + +logger = logging.getLogger(__name__) + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + return {"acc": simple_accuracy(preds, labels)} diff --git a/knlp/seq_labeling/bert/metrics/ner_metrics.py b/knlp/seq_labeling/bert/metrics/ner_metrics.py index fdef4eb..7cf03f0 100644 --- a/knlp/seq_labeling/bert/metrics/ner_metrics.py +++ b/knlp/seq_labeling/bert/metrics/ner_metrics.py @@ -3,7 +3,7 @@ from knlp.seq_labeling.bert.processors.utils_ner import get_entities class SeqEntityScore(object): - def __init__(self, id2label,markup='bios'): + def __init__(self, id2label, markup='bios'): self.id2label = id2label self.markup = markup self.reset() diff --git a/knlp/seq_labeling/bert/models/bert_for_classification.py b/knlp/seq_labeling/bert/models/bert_for_classification.py new file mode 100644 index 0000000..a13430d --- /dev/null +++ b/knlp/seq_labeling/bert/models/bert_for_classification.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn +from transformers import BertModel, BertPreTrainedModel +from torch.nn import CrossEntropyLoss + +from knlp.common.constant import KNLP_PATH +from knlp.seq_labeling.bert.losses.focal_loss import FocalLoss +from knlp.seq_labeling.bert.losses.label_smoothing import LabelSmoothingCrossEntropy + + +class BertForTokenClassification(BertPreTrainedModel): + + def __init__(self, config): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, + position_ids=None, head_mask=None, labels=None): + + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only 
diff --git a/knlp/seq_labeling/bert/models/bert_for_classification.py b/knlp/seq_labeling/bert/models/bert_for_classification.py
new file mode 100644
index 0000000..a13430d
--- /dev/null
+++ b/knlp/seq_labeling/bert/models/bert_for_classification.py
@@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+from transformers import BertModel, BertPreTrainedModel
+from torch.nn import CrossEntropyLoss
+
+from knlp.common.constant import KNLP_PATH
+from knlp.seq_labeling.bert.losses.focal_loss import FocalLoss
+from knlp.seq_labeling.bert.losses.label_smoothing import LabelSmoothingCrossEntropy
+
+
+class BertForTokenClassification(BertPreTrainedModel):
+
+    def __init__(self, config):
+        super(BertForTokenClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)[active_loss]
+                active_labels = labels.view(-1)[active_loss]
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), scores, (hidden_states), (attentions)
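The masking branch in the forward pass above flattens the [batch, seq_len, num_labels] logits and keeps only the positions where attention_mask == 1, so padded tokens never contribute to the loss. The same computation in isolation, with toy shapes:

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len, num_labels = 2, 4, 3
logits = torch.randn(batch, seq_len, num_labels)
labels = torch.randint(num_labels, (batch, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 0 marks padding

active = attention_mask.view(-1) == 1                 # [batch * seq_len] bool mask
active_logits = logits.view(-1, num_labels)[active]   # padded positions dropped
active_labels = labels.view(-1)[active]
loss = CrossEntropyLoss()(active_logits, active_labels)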
+ + """ + if task is not None: + processor = clue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Using label list %s for task %s" % (label_list, task)) + if output_mode is None: + output_mode = clue_output_modes[task] + logger.info("Using output mode %s for task %s" % (output_mode, task)) + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d" % (ex_index)) + + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + input_len = len(input_ids) + # Zero-pad up to the sequence length. + padding_length = max_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), + max_length) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), + max_length) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) + logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) + logger.info("label: %s (id = %d)" % (example.label, label)) + logger.info("input length: %d" % (input_len)) + + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + input_len=input_len)) + return features + + +class TnewsProcessor(DataProcessor): + """Processor for the TNEWS data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(17): + if i == 5 or i == 11: + continue + labels.append(str(100 + i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + 
for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line['sentence'] + text_b = None + label = str(line['label']) if set_type != 'test' else "100" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class IflytekProcessor(DataProcessor): + """Processor for the IFLYTEK data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(119): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line['sentence'] + text_b = None + label = str(line['label']) if set_type != 'test' else "0" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class AfqmcProcessor(DataProcessor): + """Processor for the AFQMC data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line['sentence1'] + text_b = line['sentence2'] + label = str(line['label']) if set_type != 'test' else "0" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class OcnliProcessor(DataProcessor): + """Processor for the CMNLI data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line["sentence1"] + text_b = line["sentence2"] + label = str(line["label"]) if set_type != 'test' else 'neutral' + if label.strip()=='-': + continue + examples.append( + 
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class CmnliProcessor(DataProcessor): + """Processor for the CMNLI data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line["sentence1"] + text_b = line["sentence2"] + label = str(line["label"]) if set_type != 'test' else 'neutral' + if label.strip()=='-': + continue + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def _create_examples(lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = " ".join(line['keyword']) + text_b = line['abst'] + label = str(line['label']) if set_type != 'test' else '0' + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class CslProcessor(DataProcessor): + """Processor for the CSL data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return _create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return _create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return _create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + +class WscProcessor(DataProcessor): + """Processor for the WSC data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + return ["true", "false"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line['text'] + text_a_list = list(text_a) + target = line['target'] + query = target['span1_text'] + query_idx = target['span1_index'] + pronoun = target['span2_text'] + pronoun_idx = target['span2_index'] + assert text_a[pronoun_idx: (pronoun_idx + len(pronoun))] == pronoun, "pronoun: {}".format(pronoun) + assert text_a[query_idx: (query_idx + len(query))] == query, "query: {}".format(query) + if 
pronoun_idx > query_idx: + text_a_list.insert(query_idx, "_") + text_a_list.insert(query_idx + len(query) + 1, "_") + text_a_list.insert(pronoun_idx + 2, "[") + text_a_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]") + else: + text_a_list.insert(pronoun_idx, "[") + text_a_list.insert(pronoun_idx + len(pronoun) + 1, "]") + text_a_list.insert(query_idx + 2, "_") + text_a_list.insert(query_idx + len(query) + 2 + 1, "_") + text_a = "".join(text_a_list) + text_b = None + label = str(line['label']) if set_type != 'test' else 'true' + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class CopaProcessor(DataProcessor): + """Processor for the COPA data set (CLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "dev.json")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "test.json")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + i = 2 * i + guid1 = "%s-%s" % (set_type, i) + guid2 = "%s-%s" % (set_type, i + 1) + premise = line['premise'] + choice0 = line['choice0'] + label = str(1 if line['label'] == 0 else 0) if set_type != 'test' else '0' + choice1 = line['choice1'] + label2 = str(0 if line['label'] == 0 else 1) if set_type != 'test' else '0' + if line['question'] == 'effect': + text_a = premise + text_b = choice0 + text_a2 = premise + text_b2 = choice1 + elif line['question'] == 'cause': + text_a = choice0 + text_b = premise + text_a2 = choice1 + text_b2 = premise + else: + raise ValueError(f'unknowed {line["question"]} type') + examples.append( + InputExample(guid=guid1, text_a=text_a, text_b=text_b, label=label)) + examples.append( + InputExample(guid=guid2, text_a=text_a2, text_b=text_b2, label=label2)) + return examples + + def _create_examples_version2(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + if line['question'] == 'cause': + text_a = line['premise'] + '这是什么原因造成的?' + line['choice0'] + text_b = line['premise'] + '这是什么原因造成的?' + line['choice1'] + else: + text_a = line['premise'] + '这造成了什么影响?' + line['choice0'] + text_b = line['premise'] + '这造成了什么影响?' 
+ line['choice1'] + label = str(1 if line['label'] == 0 else 0) if set_type != 'test' else '0' + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +clue_tasks_num_labels = { + 'iflytek': 119, + 'cmnli': 3, + 'ocnli': 3, + 'afqmc': 2, + 'csl': 2, + 'wsc': 2, + 'copa': 2, + 'tnews': 15, +} + +clue_processors = { + 'tnews': TnewsProcessor, + 'iflytek': IflytekProcessor, + 'cmnli': CmnliProcessor, + 'ocnli': OcnliProcessor, + 'afqmc': AfqmcProcessor, + 'csl': CslProcessor, + 'wsc': WscProcessor, + 'copa': CopaProcessor, +} + +clue_output_modes = { + 'tnews': "classification", + 'iflytek': "classification", + 'cmnli': "classification", + 'ocnli': "classification", + 'afqmc': "classification", + 'csl': "classification", + 'wsc': "classification", + 'copa': "classification", +} diff --git a/knlp/seq_labeling/bert/processors/classification_utils.py b/knlp/seq_labeling/bert/processors/classification_utils.py new file mode 100644 index 0000000..beb52c1 --- /dev/null +++ b/knlp/seq_labeling/bert/processors/classification_utils.py @@ -0,0 +1,104 @@ +import csv +import sys +import copy +import json + +class InputExample(object): + """ + A single training/test example for simple sequence classification. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + def __init__(self, guid, text_a, text_b=None, label=None): + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class InputFeatures(object): + """ + A single set of features of data. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. 
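End to end, the pieces above turn CLUE JSON lines into padded tensors. A sketch for TNEWS, mirroring how the trainer below consumes them (the data dir is a placeholder, and this assumes a transformers version whose encode_plus matches the call inside clue_convert_examples_to_features):

import torch
from torch.utils.data import TensorDataset
from transformers import BertTokenizer

from knlp.seq_labeling.bert.processors.classification import TnewsProcessor, clue_convert_examples_to_features

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
processor = TnewsProcessor()
examples = processor.get_train_examples('/path/to/tnews')  # reads train.json
features = clue_convert_examples_to_features(examples, tokenizer, max_length=128,
                                             label_list=processor.get_labels(),
                                             output_mode='classification')
dataset = TensorDataset(torch.tensor([f.input_ids for f in features]),
                        torch.tensor([f.attention_mask for f in features]),
                        torch.tensor([f.token_type_ids for f in features]),
                        torch.tensor([f.input_len for f in features]),
                        torch.tensor([f.label for f in features]))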
diff --git a/knlp/seq_labeling/bert/processors/classification_utils.py b/knlp/seq_labeling/bert/processors/classification_utils.py
new file mode 100644
index 0000000..beb52c1
--- /dev/null
+++ b/knlp/seq_labeling/bert/processors/classification_utils.py
@@ -0,0 +1,104 @@
+import copy
+import csv
+import json
+
+
+class InputExample(object):
+    """
+    A single training/test example for simple sequence classification.
+
+    Args:
+        guid: Unique id for the example.
+        text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+        text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+        label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+    """
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+
+class InputFeatures(object):
+    """
+    A single set of features of data.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
+        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        label: Label corresponding to the input
+    """
+
+    def __init__(self, input_ids, attention_mask, token_type_ids, label, input_len):
+        self.input_ids = input_ids
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.input_len = input_len
+        self.label = label
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r", encoding="utf-8-sig") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                lines.append(line)
+            return lines
+
+    @classmethod
+    def _read_json(cls, input_file):
+        """Reads a json list file."""
+        with open(input_file, "r") as f:
+            reader = f.readlines()
+            lines = []
+            for line in reader:
+                lines.append(json.loads(line.strip()))
+            return lines
diff --git a/knlp/seq_labeling/bert/tools/collate_fn.py b/knlp/seq_labeling/bert/tools/collate_fn.py
new file mode 100644
index 0000000..65f8039
--- /dev/null
+++ b/knlp/seq_labeling/bert/tools/collate_fn.py
@@ -0,0 +1,11 @@
+import torch
+
+
+def collate_fn(batch):
+    """
+    batch is a list of (input_ids, attention_mask, token_type_ids, length, label) tuples;
+    truncates every padded tensor in the batch to the length of its longest real sequence.
+    """
+    all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels = map(torch.stack, zip(*batch))
+    max_len = max(all_lens).item()
+    all_input_ids = all_input_ids[:, :max_len]
+    all_attention_mask = all_attention_mask[:, :max_len]
+    all_token_type_ids = all_token_type_ids[:, :max_len]
+    return all_input_ids, all_attention_mask, all_token_type_ids, all_labels
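collate_fn depends on the dataset carrying per-example lengths (the fourth tensor), so each batch is truncated to its own longest sequence rather than the global max_length. A self-contained usage sketch:

import torch
from torch.utils.data import DataLoader, TensorDataset

from knlp.seq_labeling.bert.tools.collate_fn import collate_fn

# two examples padded to max_length = 8; real lengths are 5 and 3
input_ids = torch.tensor([[101, 7, 8, 9, 102, 0, 0, 0],
                          [101, 7, 102, 0, 0, 0, 0, 0]])
attention_mask = (input_ids != 0).long()
token_type_ids = torch.zeros_like(input_ids)
lens = torch.tensor([5, 3])
labels = torch.tensor([1, 0])
dataset = TensorDataset(input_ids, attention_mask, token_type_ids, lens, labels)
loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch[0].shape)  # torch.Size([2, 5]) -- truncated to the batch max, not 8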
diff --git a/knlp/seq_labeling/bert/tools/progressbar.py b/knlp/seq_labeling/bert/tools/progressbar.py
new file mode 100644
index 0000000..7df72fe
--- /dev/null
+++ b/knlp/seq_labeling/bert/tools/progressbar.py
@@ -0,0 +1,59 @@
+import time
+
+
+class ProgressBar(object):
+    '''
+    custom progress bar
+    Example:
+        >>> pbar = ProgressBar(n_total=30, desc='Training')
+        >>> step = 2
+        >>> pbar(step=step)
+    '''
+    def __init__(self, n_total, width=30, desc='Training'):
+        self.width = width
+        self.n_total = n_total
+        self.start_time = time.time()
+        self.desc = desc
+
+    def __call__(self, step, info={}):
+        now = time.time()
+        current = step + 1
+        recv_per = current / self.n_total
+        bar = f'[{self.desc}] {current}/{self.n_total} ['
+        if recv_per >= 1:
+            recv_per = 1
+        prog_width = int(self.width * recv_per)
+        if prog_width > 0:
+            bar += '=' * (prog_width - 1)
+            if current < self.n_total:
+                bar += ">"
+            else:
+                bar += '='
+        bar += '.' * (self.width - prog_width)
+        bar += ']'
+        show_bar = f"\r{bar}"
+        time_per_unit = (now - self.start_time) / current
+        if current < self.n_total:
+            eta = time_per_unit * (self.n_total - current)
+            if eta > 3600:
+                eta_format = ('%d:%02d:%02d' %
+                              (eta // 3600, (eta % 3600) // 60, eta % 60))
+            elif eta > 60:
+                eta_format = '%d:%02d' % (eta // 60, eta % 60)
+            else:
+                eta_format = '%ds' % eta
+            time_info = f' - ETA: {eta_format}'
+        else:
+            if time_per_unit >= 1:
+                time_info = f' {time_per_unit:.1f}s/step'
+            elif time_per_unit >= 1e-3:
+                time_info = f' {time_per_unit * 1e3:.1f}ms/step'
+            else:
+                time_info = f' {time_per_unit * 1e6:.1f}us/step'
+
+        show_bar += time_info
+        if len(info) != 0:
+            show_info = f'{show_bar} ' + \
+                "-".join([f' {key}: {value:.4f} ' for key, value in info.items()])
+            print(show_info, end='')
+        else:
+            print(show_bar, end='')
diff --git a/knlp/seq_labeling/classification/ModelTrainer/__init__.py b/knlp/seq_labeling/classification/ModelTrainer/__init__.py
new file mode 100644
index 0000000..e69de29
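The info dict lets callers stream metrics into the bar, which is how the trainers below report the running loss. Small usage sketch:

import time
from knlp.seq_labeling.bert.tools.progressbar import ProgressBar

pbar = ProgressBar(n_total=10, desc='Training')
for step in range(10):
    time.sleep(0.1)  # stand-in for one training step
    pbar(step, info={'loss': 1.0 / (step + 1)})
print()  # final newline after the \r-based bar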
diff --git a/knlp/seq_labeling/classification/ModelTrainer/model_train.py b/knlp/seq_labeling/classification/ModelTrainer/model_train.py
new file mode 100644
index 0000000..bc9b4be
--- /dev/null
+++ b/knlp/seq_labeling/classification/ModelTrainer/model_train.py
@@ -0,0 +1,107 @@
+import jieba
+
+from knlp.common.constant import KNLP_PATH, class_model_list
+from knlp.Pipeline.PipeTrainer import PipeTrainer
+from knlp.seq_labeling.classification.bert.trainer import BertTrain
+from knlp.nn.textcnn.train_textcnn import TrainTextCNN
+from knlp.seq_labeling.classification.beyas.beyas_train import beyas_train
+
+
+class ModelTrainer(PipeTrainer):
+    def __init__(self, data_path, vocab_path, model):
+        """
+        :param data_path: dataset path (pointing at the training data itself, as used by models such as hmm, crf and trie)
+        :param vocab_path: path to the dataset vocab
+        :param model: one model from the model list, or all of them
+        """
+        super().__init__()
+        self.training_data_path = data_path
+        self.vocab_set_path = vocab_path
+        self.clf_model_path = KNLP_PATH + "/knlp/model/beyas/classification"
+        self.tf_model_path = KNLP_PATH + "/knlp/model/beyas/classification"
+        self.model = model
+        self.model_list = class_model_list
+        if not data_path:
+            self.training_data_path = KNLP_PATH + '/knlp/data/class_clue'
+
+    def train(self):
+        if self.model not in self.model_list and self.model != 'all':
+            print(f'only support model in {self.model_list}')
+        else:
+            if self.model == 'bert':
+                self.bert_train()
+            elif self.model == 'textcnn':
+                self.textcnn_train(model_save_path=KNLP_PATH + "/knlp/model/classification/textcnn.pkl",
+                                   word2idx_path=KNLP_PATH + "/knlp/nn/textcnn/model_textcnn/weibo_word2idx.json",
+                                   label2idx_path=KNLP_PATH + "/knlp/nn/textcnn/model_textcnn/weibo_label2idx.json")
+            elif self.model == 'beyas':
+                self.beyas_train(clf_model_path=self.clf_model_path, tf_model_path=self.tf_model_path)
+            elif self.model == 'all':
+                self.bert_train()
+                self.textcnn_train(model_save_path=KNLP_PATH + "/knlp/model/classification/textcnn.pkl",
+                                   word2idx_path=KNLP_PATH + "/knlp/nn/textcnn/model_textcnn/weibo_word2idx.json",
+                                   label2idx_path=KNLP_PATH + "/knlp/nn/textcnn/model_textcnn/weibo_label2idx.json")
+                self.beyas_train(clf_model_path=self.clf_model_path, tf_model_path=self.tf_model_path)
+
+    def your_model_train(self):
+        """
+        example:
+        print('your_model_name - classification training started')
+        YourModelTrainer = YourModelTrain(**params)
+        YourModelTrainer.run(**params)
+        print('your_model_name - classification training finished')
+        """
+        pass
+
+    def bert_train(self):
+        print('Bert - text classification training started')
+        BertTrainer = BertTrain(data_path=self.training_data_path, tokenizer_vocab=self.vocab_set_path)
+        BertTrainer.run()
+        print('Bert - text classification training finished')
+
+    def textcnn_train(self, model_save_path, word2idx_path, label2idx_path):
+        kwargs = {
+            "dataset_hyperparameters": {
+                "vocab_set_path": self.vocab_set_path,
+                # "training_data_path": KNLP_PATH + "/knlp/nn/textcnn/data_textcnn/text_classification_weibo_eval_9988.txt",
+                "training_data_path": self.training_data_path,
+                # "eval_data_path": KNLP_PATH + "/knlp/nn/textcnn/data_textcnn/text_classification_weibo_eval_9988.txt",
+                "tokenizer": jieba.lcut,
+                "shuffle": True,
+                "batch_size": 64,
+                "max_length": 150
+            },
+            "optimizer_hyperparameters": {
+                "lr": 0.01,
+                "weight_decay": 1e-4
+            },
+            "model_hyperparameters": {
+                "n_filters": 100,
+                "filter_sizes": [3, 4, 5]
+            },
+            # "non_static_word2vec_path": KNLP_PATH + "/knlp/nn/textcnn/data_textcnn/text_classification_weibo_word2vec_300d_20509.txt",
+            "static_word2vec_path": KNLP_PATH + "/knlp/nn/textcnn/data_textcnn/text_classification_weibo_word2vec_100d_22770.txt",
+        }
+        save_kwargs = {
+            "model_path": model_save_path,
+            "word2idx_path": word2idx_path,
+            "label2idx_path": label2idx_path,
+        }
+        print("Textcnn - text classification training started")
+        train = TrainTextCNN(**kwargs)
+        train.train(5)
+        train.save(**save_kwargs)
+        print("Textcnn - text classification training finished")
+
+    def beyas_train(self, clf_model_path, tf_model_path):
+        print("Beyas - text classification training started")
+        beyas = beyas_train(file_path=self.training_data_path, clf_model_path=clf_model_path, tf_model_path=tf_model_path)
+        train_datas, train_labels = beyas.load_data()
+        beyas.train_model(datas=train_datas, labels=train_labels)
+        print("Beyas - text classification training finished")
+
+
+if __name__ == '__main__':
+    for model in ['bert', 'beyas', 'textcnn']:
+        test = ModelTrainer(data_path=KNLP_PATH + '/knlp/data/msra_bios/train.bios',
+                            vocab_path=KNLP_PATH + '/knlp/data/cluener_public/cluener_vocab.txt',
+                            model=model)
+        test.train()
diff --git a/knlp/seq_labeling/classification/__init__.py b/knlp/seq_labeling/classification/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/knlp/seq_labeling/classification/bert/__init__.py b/knlp/seq_labeling/classification/bert/__init__.py
new file mode 100644
index 0000000..e69de29
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", + 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", + 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", +} + + +class BertConfig(PretrainedConfig): + r""" + :class:`~transformers.BertConfig` is the configuration class to store the configuration of a + `BertModel`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. 
+ """ + pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs): + super(BertConfig, self).__init__(**kwargs) + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") diff --git a/knlp/seq_labeling/classification/bert/inference.py b/knlp/seq_labeling/classification/bert/inference.py new file mode 100644 index 0000000..c470b49 --- /dev/null +++ b/knlp/seq_labeling/classification/bert/inference.py @@ -0,0 +1,55 @@ +import json +import os +import numpy as np +import torch +from torch.utils.data import SequentialSampler, DataLoader, DistributedSampler +from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, BertTokenizer +import torch.nn.functional as F +from knlp.common.constant import KNLP_PATH +from knlp.seq_labeling.bert.processors.classification import TnewsProcessor as processor +from knlp.seq_labeling.classification.bert.trainer import BertTrain +from knlp.utils.tokenization import BasicTokenizer +from knlp.seq_labeling.bert.tools.progressbar import ProgressBar +from knlp.seq_labeling.bert.tools.collate_fn import collate_fn + +BERT_MODEL_PATH = KNLP_PATH + "/knlp/model/bert/output_modelbert" + +class bertinference(): + def __init__(self, task): + self.task = task + + def predict(self, model, text): + tokenizer = BasicTokenizer(vocab_file=KNLP_PATH + '/knlp/data/msra_bios/vocab.txt', do_lower_case=True) + nb_pred_steps = 0 + preds = None + pbar = ProgressBar(n_total=len(text), desc="Predicting") + input_tokens = tokenizer.tokenize(text) + for step, batch in enumerate(pred_dataloader): + model.eval() + batch = tuple(t.to(args.device) for t in batch) + with torch.no_grad(): + inputs = {'input_ids': input_tokens[0], + 'attention_mask': input_tokens[1], + 'labels': input_tokens[3]} + if args.model_type != 'distilbert': + inputs['token_type_ids'] = batch[2] + outputs = model(**inputs) + _, logits = outputs[:2] + nb_pred_steps += 1 + if preds is None: + preds = logits.detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + pbar(step) + print(' ') + predict_label = np.argmax(preds, axis=1) + return predict_label + +if __name__ == '__main__': + inference = 
diff --git a/knlp/seq_labeling/classification/bert/inference.py b/knlp/seq_labeling/classification/bert/inference.py
new file mode 100644
index 0000000..c470b49
--- /dev/null
+++ b/knlp/seq_labeling/classification/bert/inference.py
@@ -0,0 +1,55 @@
+import numpy as np
+import torch
+from transformers import BertTokenizer
+
+from knlp.common.constant import KNLP_PATH
+from knlp.seq_labeling.bert.models.bert_for_classification import BertForTokenClassification
+from knlp.seq_labeling.bert.processors.classification import TnewsProcessor as processor
+
+BERT_MODEL_PATH = KNLP_PATH + "/knlp/model/bert/output_modelbert"
+
+
+class bertinference():
+    def __init__(self, task):
+        self.task = task
+
+    def predict(self, model, text, device='cpu'):
+        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH)
+        model.eval()
+        # Encode the raw text into a single-example batch of tensors.
+        inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
+        inputs = {key: value.to(device) for key, value in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs)
+        logits = outputs[0]
+        preds = logits.detach().cpu().numpy()
+        predict_label = np.argmax(preds, axis=-1)
+        return predict_label
+
+
+if __name__ == '__main__':
+    inference = bertinference('tnews')
+    to_be_pred = '我还行'
+    model = BertForTokenClassification.from_pretrained(KNLP_PATH + '/knlp/model/bert/output_modelbert/checkpoint-448')
+    model.to('cpu')
+    label_list = processor().get_labels()
+    pred = inference.predict(model=model, text=to_be_pred)
+    print([label_list[idx] for idx in np.ravel(pred)])
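predict() above returns only argmax label ids; when class probabilities are wanted, apply a softmax to the logits first. A minimal numpy version of that post-processing:

import numpy as np

def logits_to_probs(logits):
    # numerically stable softmax over the last axis
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=-1, keepdims=True)

probs = logits_to_probs(np.array([[2.0, 0.5, -1.0]]))
print(probs, probs.argmax(axis=-1))  # class 0 wins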
diff --git a/knlp/seq_labeling/classification/bert/trainer.py b/knlp/seq_labeling/classification/bert/trainer.py
new file mode 100644
index 0000000..faaaa83
--- /dev/null
+++ b/knlp/seq_labeling/classification/bert/trainer.py
@@ -0,0 +1,476 @@
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from transformers import (BertForSequenceClassification, BertTokenizer,
+                          XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
+from transformers import AdamW, WarmupLinearSchedule
+
+from knlp.common.constant import KNLP_PATH
+from knlp.nn.train_nn import TrainNN  # base trainer class; adjust this import to TrainNN's actual location in knlp
+from knlp.seq_labeling.classification.bert.bert_config import BertConfig
+from knlp.seq_labeling.bert.models.bert_for_classification import BertForTokenClassification
+from knlp.seq_labeling.bert.metrics.classification_metrics import compute_metrics
+from knlp.seq_labeling.bert.processors.classification import clue_output_modes as output_modes
+from knlp.seq_labeling.bert.processors.classification import clue_processors as processors
+from knlp.seq_labeling.bert.processors.classification import \
+    clue_convert_examples_to_features as convert_examples_to_features
+from knlp.seq_labeling.bert.processors.classification import collate_fn, xlnet_collate_fn
+from knlp.seq_labeling.bert.tools.common import seed_everything, init_logger, logger
+from knlp.seq_labeling.bert.tools.progressbar import ProgressBar
+from knlp.utils.tokenization import BasicTokenizer
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+}
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf, _, _ in MODEL_CLASSES.values()), ())
+
+
+# TODO: update the training data path!
+
+def get_argparse():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir", default=KNLP_PATH + "/knlp/data/cluener_public", type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_type", default='bert', type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=KNLP_PATH + "/knlp/model/bert/Chinese_wwm", type=str,
+                        required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
+                            ALL_MODELS))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument("--output_dir", default=KNLP_PATH + "/knlp/model/bert/output_model", type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    # Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_predict", action='store_true',
+                        help="Whether to run the model in inference mode on the test set.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.01, type=float,
+                        help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
+    parser.add_argument("--warmup_proportion", default=0.1, type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% of "
+                             "training.")
+    parser.add_argument('--logging_steps', type=int, default=10,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=1000,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending "
+                             "with step number")
+    parser.add_argument("--predict_checkpoints", type=int, default=0,
+                        help="predict checkpoints starting with the same prefix as model_name ending and ending with "
+                             "step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+    parser.add_argument("--fp16", action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument("--fp16_opt_level", type=str, default="O1",
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3'].")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    return parser
+
+
+class BertTrain(TrainNN):
+    def __init__(self, device: str = "cuda", data_path=None, tokenizer_vocab=None, save_path=None):
+        super().__init__(device=device)
+        self.config_class = BertConfig
+        self.output_dir = save_path if save_path else KNLP_PATH + "/knlp/model/bert/output_model"
+        self.vocab = KNLP_PATH + '/knlp/model/bert/Chinese_wwm/vocab.txt' if not tokenizer_vocab else tokenizer_vocab
+        self.tokenizer = BasicTokenizer(vocab_file=self.vocab,
+                                        do_lower_case=True)
+        self.training_data_path = data_path
+
+    def train(self, args, train_dataset, model, tokenizer):
+        """ Train the model """
+        args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+        train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
+                                      collate_fn=collate_fn)
+        if args.max_steps > 0:
+            t_total = args.max_steps
+            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+        else:
+            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        args.warmup_steps = int(t_total * args.warmup_proportion)
+        # Prepare optimizer and schedule (linear warmup and decay)
+        no_decay = ['bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+             'weight_decay': args.weight_decay},
+            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+        if args.fp16:
+            try:
+                from apex import amp
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+        # multi-gpu training (should be after apex fp16 initialization)
+        if args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
+        # Distributed training (should be after apex fp16 initialization)
+        if args.local_rank != -1:
+            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                              output_device=args.local_rank,
+                                                              find_unused_parameters=True)
+
+        # Train!
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_dataset))
+        logger.info("  Num Epochs = %d", args.num_train_epochs)
+        logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+        logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                    args.train_batch_size * args.gradient_accumulation_steps * (
+                        torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+        logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+        logger.info("  Total optimization steps = %d", t_total)
+
+        global_step = 0
+        tr_loss, logging_loss = 0.0, 0.0
+        model.zero_grad()
+        seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
+        for _ in range(int(args.num_train_epochs)):
+            pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
+            for step, batch in enumerate(train_dataloader):
+                model.train()
+                batch = tuple(t.to(args.device) for t in batch)
+                inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3],
+                          'token_type_ids': batch[2]}
+                outputs = model(**inputs)
+                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+                if args.n_gpu > 1:
+                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.fp16:
+                    with amp.scale_loss(loss, optimizer) as scaled_loss:
+                        scaled_loss.backward()
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    loss.backward()
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                pbar(step, {'loss': loss.item()})
+                tr_loss += loss.item()
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()
+                    scheduler.step()  # Update learning rate schedule
+                    model.zero_grad()
+                    global_step += 1
+
+                    if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                        print(" ")
+                        # Log metrics
+                        if args.local_rank == -1:
+                            # Only evaluate when single GPU, otherwise metrics may not average well
+                            self.evaluate(args, model, tokenizer)
+
+                    if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                        # Save model checkpoint
+                        output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                        if not os.path.exists(output_dir):
+                            os.makedirs(output_dir)
+                        model_to_save = model.module if hasattr(model,
+                                                                'module') else model  # Take care of
+                        # distributed/parallel training
+                        model_to_save.save_pretrained(output_dir)
+                        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                        logger.info("Saving model checkpoint to %s", output_dir)
+                        tokenizer.save_vocabulary(vocab_path=output_dir)
+            print(" ")
+            if 'cuda' in str(args.device):
+                torch.cuda.empty_cache()
+        return global_step, tr_loss / global_step
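The optimizer_grouped_parameters split in train() above is the usual trick of exempting biases and LayerNorm weights from weight decay. The same pattern in isolation (torch.optim.AdamW stands in for transformers' AdamW here):

import torch

class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(10, 10)
        self.LayerNorm = torch.nn.LayerNorm(10)

model = Tiny()
no_decay = ['bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=5e-5)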
* max(1, args.n_gpu) + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler( + eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, + collate_fn=collate_fn) + + # Eval! + logger.info("********* Running evaluation {} ********".format(prefix)) + eval_loss = 0.0 + nb_eval_steps = 0 + preds = None + out_label_ids = None + pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating") + for step, batch in enumerate(eval_dataloader): + model.eval() + batch = tuple(t.to(args.device) for t in batch) + with torch.no_grad(): + inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3], + 'token_type_ids': batch[2]} + outputs = model(**inputs) + tmp_eval_loss, logits = outputs[:2] + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if preds is None: + preds = logits.detach().cpu().numpy() + out_label_ids = inputs['labels'].detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + pbar(step) + print(' ') + if 'cuda' in str(args.device): + torch.cuda.empty_cache() + eval_loss = eval_loss / nb_eval_steps + if args.output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif args.output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(eval_task, preds, out_label_ids) + results.update(result) + logger.info(" Num examples = %d", len(eval_dataset)) + logger.info(" Batch size = %d", args.eval_batch_size) + logger.info("******** Eval results {} ********".format(prefix)) + for key in sorted(result.keys()): + logger.info(" dev: %s = %s", key, str(result[key])) + return results + + def load_and_cache_examples(args, task, tokenizer, data_type='train'): + global all_labels + if args.local_rank not in [-1, 0] and not evaluate: + torch.distributed.barrier() # Make sure only the first process in distributed training process the + # dataset, and the others will use the cache + + processor = processors[task]() + output_mode = output_modes[task] + # Load data features from cache or dataset file + cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( + data_type, + list(filter(None, args.model_name_or_path.split('/'))).pop(), + str(args.max_seq_length), + str(task))) + if os.path.exists(cached_features_file): + logger.info("Loading features from cached file %s", cached_features_file) + features = torch.load(cached_features_file) + else: + logger.info("Creating features from dataset file at %s", args.data_dir) + label_list = processor.get_labels() + if data_type == 'train': + examples = processor.get_train_examples(args.data_dir) + elif data_type == 'dev': + examples = processor.get_dev_examples(args.data_dir) + else: + examples = processor.get_test_examples(args.data_dir) + + features = convert_examples_to_features(examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=False, + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=0, + ) + if args.local_rank in [-1, 0]: + logger.info("Saving features into cached file %s", cached_features_file) + torch.save(features, cached_features_file) + + if args.local_rank == 0 and not evaluate: + torch.distributed.barrier() # Make sure only the first process in distributed training process 
+            # dataset; the others will use the cache
+            torch.distributed.barrier()
+
+        # Convert to Tensors and build dataset
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
+        if output_mode == "classification":
+            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels)
+        return dataset
+
+    def run(self):
+        parser = argparse.ArgumentParser()
+        ## Required parameters
+        args = parser.parse_args()
+        if self.training_data_path:
+            args.data_dir = self.training_data_path
+        tokenizer = self.tokenizer
+        if not os.path.exists(args.output_dir):
+            os.mkdir(args.output_dir)
+        init_logger(log_file=args.output_dir)
+        if os.path.exists(args.output_dir) and os.listdir(
+                args.output_dir) and args.do_train and not args.overwrite_output_dir:
+            raise ValueError(
+                "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                    args.output_dir))
+
+        # Setup distant debugging if needed
+        if args.server_ip and args.server_port:
+            # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+            import ptvsd
+            print("Waiting for debugger attach")
+            ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+            ptvsd.wait_for_attach()
+
+        # Setup CUDA, GPU & distributed training
+        if args.local_rank == -1 or args.no_cuda:
+            device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+            args.n_gpu = torch.cuda.device_count()
+        else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+            torch.cuda.set_device(args.local_rank)
+            device = torch.device("cuda", args.local_rank)
+            torch.distributed.init_process_group(backend='nccl')
+            args.n_gpu = 1
+        args.device = device
+
+        # Setup logging
+        logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                       args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+        # Set seed
+        seed_everything(args.seed)
+        # Prepare CLUE task
+        args.task_name = args.task_name.lower()
+        if args.task_name not in processors:
+            raise ValueError("Task not found: %s" % (args.task_name))
+        processor = processors[args.task_name]()
+        args.output_mode = output_modes[args.task_name]
+        label_list = processor.get_labels()
+        num_labels = len(label_list)
+
+        # Load pretrained model and tokenizer
+        if args.local_rank not in [-1, 0]:
+            torch.distributed.barrier()  # Make sure only the first process in distributed training will download
+            # model & vocab
+
+        config_class = self.config_class
+        config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
+                                              num_labels=num_labels, finetuning_task=args.task_name)
+        model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
+                                                           from_tf=bool('.ckpt' in args.model_name_or_path),
+                                                           config=config)
+
+        if args.local_rank == 0:
+            torch.distributed.barrier()  # Make sure only the first process in distributed training will download
+            # model & vocab
+        model.to(args.device)
+        logger.info("Training/evaluation parameters %s", args)
+        # Training
+        if args.do_train:
+            train_dataset = self.load_and_cache_examples(args, args.task_name, tokenizer, data_type='train')
+            global_step, tr_loss = self.train(args, train_dataset, model, tokenizer)
+            logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+        # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
+        if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+            # Create output directory if needed
+            if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+                os.makedirs(args.output_dir)
+
+            logger.info("Saving model checkpoint to %s", args.output_dir)
+            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+            # They can then be reloaded using `from_pretrained()`
+            model_to_save = model.module if hasattr(model,
+                                                    'module') else model  # Take care of distributed/parallel training
+            model_to_save.save_pretrained(args.output_dir)
+            tokenizer.save_pretrained(args.output_dir)
+
+            # Good practice: save your training arguments together with the trained model
+            torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+            # Load a trained model and vocabulary that you have fine-tuned
+            model = BertForTokenClassification.from_pretrained(args.output_dir)
+            model.to(args.device)
+        # Evaluation
+        results = {}
+        if args.do_eval and args.local_rank in [-1, 0]:
+            checkpoints = [args.output_dir]
+            if args.eval_all_checkpoints:
+                checkpoints = list(
+                    os.path.dirname(c) for c in
+                    sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logger.info("Evaluate the following checkpoints: %s", checkpoints)
+            for checkpoint in checkpoints:
+                global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+                prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+                model = BertForTokenClassification.from_pretrained(checkpoint)
+                model.to(args.device)
+                result = self.evaluate(args, model, tokenizer, prefix=prefix)
+                result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+                results.update(result)
+            output_eval_file = os.path.join(args.output_dir, "checkpoint_eval_results.txt")
+            with open(output_eval_file, "w") as writer:
+                for key in sorted(results.keys()):
+                    writer.write("%s = %s\n" % (key, str(results[key])))
+
+        if args.do_predict and args.local_rank in [-1, 0]:
+            checkpoints = [args.output_dir]
+            if args.predict_checkpoints > 0:
+                checkpoints = list(
+                    os.path.dirname(c) for c in
+                    sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+                checkpoints = [x for x in checkpoints if x.split('-')[-1] == str(args.predict_checkpoints)]
+            logger.info("Predict the following checkpoints: %s", checkpoints)
+            for checkpoint in checkpoints:
+                prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+                model = BertForTokenClassification.from_pretrained(checkpoint)
+                model.to(args.device)
+                self.predict(args, model, tokenizer, label_list, prefix=prefix)
+
+
+if __name__ == "__main__":
+    print('Bert text classification training started')
+    trainer = BertTrain()
+    trainer.run()
+    print('Bert text classification training finished')
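A caveat on run() above: it calls parser.parse_args() without registering any arguments, so every args.* attribute read afterwards would fail at runtime. Below is a minimal sketch of the argument definitions the body implies; the flag names are inferred from how args is used in this file, while the types, defaults and required markers are placeholder assumptions, not values from the knlp project.

import argparse

parser = argparse.ArgumentParser()
# Names inferred from the args.* attributes accessed in train(), evaluate(),
# load_and_cache_examples() and run(); types and defaults are assumptions.
parser.add_argument("--task_name", type=str, required=True)
parser.add_argument("--data_dir", type=str, required=True)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument("--model_name_or_path", type=str, required=True)
parser.add_argument("--config_name", type=str, default="")
parser.add_argument("--max_seq_length", type=int, default=128)
parser.add_argument("--per_gpu_train_batch_size", type=int, default=8)
parser.add_argument("--per_gpu_eval_batch_size", type=int, default=8)
parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
parser.add_argument("--num_train_epochs", type=float, default=3.0)
parser.add_argument("--max_grad_norm", type=float, default=1.0)
parser.add_argument("--logging_steps", type=int, default=50)
parser.add_argument("--save_steps", type=int, default=50)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--do_train", action="store_true")
parser.add_argument("--do_eval", action="store_true")
parser.add_argument("--do_predict", action="store_true")
parser.add_argument("--eval_all_checkpoints", action="store_true")
parser.add_argument("--predict_checkpoints", type=int, default=0)
parser.add_argument("--overwrite_output_dir", action="store_true")
parser.add_argument("--no_cuda", action="store_true")
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--local_rank", type=int, default=-1)
parser.add_argument("--server_ip", type=str, default="")
parser.add_argument("--server_port", type=str, default="")
args = parser.parse_args()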
diff --git a/knlp/seq_labeling/classification/beyas/beyas_inference.py b/knlp/seq_labeling/classification/beyas/beyas_inference.py
new file mode 100644
index 0000000..92d5e33
--- /dev/null
+++ b/knlp/seq_labeling/classification/beyas/beyas_inference.py
@@ -0,0 +1,24 @@
+import jieba
+import joblib
+
+
+class beyas_inference():
+
+    def __init__(self, model_path, tf_path):
+        self.model_path = model_path
+        self.tf_path = tf_path
+
+    def load_model(self):
+        # Load the persisted naive Bayes classifier and the fitted TF-IDF vectorizer
+        MODEL = joblib.load(self.model_path)
+        TF = joblib.load(self.tf_path)
+        return MODEL, TF
+
+    def predict(self, sentence, MODEL, TF):
+        assert MODEL is not None and TF is not None
+        words = jieba.cut(sentence)
+        s = ' '.join(words)
+        test_features = TF.transform([s])
+        predicted_labels = MODEL.predict(test_features)
+        return predicted_labels[0]
diff --git a/knlp/seq_labeling/classification/beyas/beyas_train.py b/knlp/seq_labeling/classification/beyas/beyas_train.py
new file mode 100644
index 0000000..7265db3
--- /dev/null
+++ b/knlp/seq_labeling/classification/beyas/beyas_train.py
@@ -0,0 +1,46 @@
+import jieba
+import joblib
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+
+from knlp.common.constant import KNLP_PATH
+
+
+class beyas_train():
+    def __init__(self, file_path, clf_model_path, tf_model_path):
+        self.file_path = file_path
+        self.clf_model_path = clf_model_path
+        self.tf_model_path = tf_model_path
+
+    def load_data(self):
+        with open(self.file_path) as f:
+            lines = f.readlines()
+        data = []
+        label = []
+        for line in lines:
+            line = eval(line)  # each line is a dict literal with 'query' and 'label' keys
+            words = jieba.cut(line['query'])
+            data.append(' '.join(words))
+            label.append(line['label'])
+        return data, label
+
+    def train(self, datas, labels):
+        # Fit the TF-IDF vectorizer and the multinomial naive Bayes classifier, then persist both
+        tf = TfidfVectorizer(max_df=0.5)
+        train_features = tf.fit_transform(datas)
+        clf = MultinomialNB(alpha=0.001).fit(train_features, labels)
+        joblib.dump(clf, self.clf_model_path)
+        joblib.dump(tf, self.tf_model_path)
+
+
+if __name__ == '__main__':
+    test = beyas_train(KNLP_PATH + '/knlp/seq_labeling/classification/bert/dataset/data_train.json',
+                       KNLP_PATH + '/knlp/seq_labeling/classification/beyas/model/nb.pkl',
+                       KNLP_PATH + '/knlp/seq_labeling/classification/beyas/model/tf.pkl')
+    train_datas, train_labels = test.load_data()
+    test.train(train_datas, train_labels)
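For reference, a short end-to-end sketch of the two beyas classes above; the file and model paths reuse the ones from beyas_train's __main__ block, and the example sentence is an arbitrary placeholder.

from knlp.common.constant import KNLP_PATH
from knlp.seq_labeling.classification.beyas.beyas_train import beyas_train
from knlp.seq_labeling.classification.beyas.beyas_inference import beyas_inference

nb_path = KNLP_PATH + '/knlp/seq_labeling/classification/beyas/model/nb.pkl'
tf_path = KNLP_PATH + '/knlp/seq_labeling/classification/beyas/model/tf.pkl'

# Fit TF-IDF + MultinomialNB on the JSON-lines training file and persist both artifacts
trainer = beyas_train(KNLP_PATH + '/knlp/seq_labeling/classification/bert/dataset/data_train.json',
                      nb_path, tf_path)
datas, labels = trainer.load_data()
trainer.train(datas, labels)

# Reload the persisted classifier and vectorizer, then classify one sentence
infer = beyas_inference(nb_path, tf_path)
MODEL, TF = infer.load_model()
print(infer.predict('我很开心', MODEL, TF))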
diff --git a/knlp/seq_labeling/classification/pipeline.py b/knlp/seq_labeling/classification/pipeline.py
new file mode 100644
index 0000000..b9bbe08
--- /dev/null
+++ b/knlp/seq_labeling/classification/pipeline.py
@@ -0,0 +1,124 @@
+import jieba
+
+from knlp.common.constant import KNLP_PATH, class_model_list
+from knlp.nn.textcnn.inference_textcnn import InferenceTextCNN
+from knlp.Pipeline.pipeline import Pipeline
+from knlp.seq_labeling.classification.ModelTrainer.model_train import ModelTrainer
+from knlp.seq_labeling.classification.bert.inference import bertinference
+from knlp.seq_labeling.classification.beyas.beyas_inference import beyas_inference
+from knlp.seq_labeling.bert.models.bert_for_classification import BertForTokenClassification
+
+
+class ClassificationPipeline(Pipeline):
+
+    def __init__(self, type, data_path=KNLP_PATH + '/knlp/data/bios_clue/train.txt',
+                 dev_path=KNLP_PATH + '/knlp/data/clue/val.txt',
+                 vocab_path=KNLP_PATH + '/knlp/data/clue/vocab.txt',
+                 word2idx_path=KNLP_PATH + "/knlp/nn/textcnn/model_textcnn/weibo_word2idx.json",
+                 label2idx_path=KNLP_PATH + "/knlp/nn/textcnn/model_textcnn/weibo_label2idx.json",
+                 max_length=150):
+        """
+        Args:
+            type: choice between 'train' and 'inference'
+            data_path: path to the dataset (points at the training file itself, used for model training)
+            dev_path: path to the validation data
+            vocab_path: path to the vocabulary file
+            word2idx_path: path to the word2idx mapping used by textcnn
+            label2idx_path: path to the label2idx mapping used by textcnn
+            max_length: maximum truncation length
+        """
+        super().__init__()
+        if data_path:
+            self.training_data_path = data_path
+        if dev_path:
+            self.dev_path = dev_path
+        if vocab_path:
+            self.vocab_set_path = vocab_path
+        if word2idx_path:
+            self.word2idx_path = word2idx_path
+        if label2idx_path:
+            self.label2idx_path = label2idx_path
+        self.type = type
+        self.max_length = max_length
+        self.model_list = class_model_list
+        # where the bert model is stored
+        self.model_path_bert = KNLP_PATH + "/knlp/model/bert/output_model"
+        # where the beyas (naive Bayes) classifier and TF-IDF vectorizer are stored
+        self.model_path_clf = KNLP_PATH + "/knlp/model/beyas/classification"
+        self.model_path_tf = KNLP_PATH + "/knlp/model/beyas/classification"
+        # where the textcnn model is stored
+        self.model_path_textcnn = KNLP_PATH + "/knlp/model/classification/textcnn.pkl"
+
+    def train(self, model):
+        if model not in self.model_list and model != 'all':
+            print(f'only support model in {self.model_list}')
+            return
+        trainer = ModelTrainer(data_path=self.training_data_path,
+                               vocab_path=self.vocab_set_path,
+                               model=model)
+        if model == 'beyas':
+            trainer.beyas_train(clf_model_path=self.model_path_clf, tf_model_path=self.model_path_tf)
+        elif model == 'bert':
+            trainer.bert_train()
+        elif model == 'textcnn':
+            trainer.textcnn_train(model_save_path=self.model_path_textcnn,
+                                  word2idx_path=self.word2idx_path,
+                                  label2idx_path=self.label2idx_path)
+        elif model == 'all':
+            trainer.beyas_train(clf_model_path=self.model_path_clf, tf_model_path=self.model_path_tf)
+            trainer.bert_train()
+            trainer.textcnn_train(model_save_path=self.model_path_textcnn,
+                                  word2idx_path=self.word2idx_path,
+                                  label2idx_path=self.label2idx_path)
+
+    def inference(self, model, input, model_path_textcnn=None, model_path_bert=None, model_path_clf=None,
+                  model_path_tf=None):
+        words = input
+        model_bert = model_path_bert if model_path_bert else self.model_path_bert
+        model_textcnn = model_path_textcnn if model_path_textcnn else self.model_path_textcnn
+        model_clf = model_path_clf if model_path_clf else self.model_path_clf
+        model_tf = model_path_tf if model_path_tf else self.model_path_tf
+        if model not in self.model_list and model != 'all':
+            print(f'only support model in {self.model_list}')
+        elif model == 'bert':
+            self.bert_inference(words, model_bert)
+        elif model == 'textcnn':
+            self.textcnn_inference(words, self.max_length, model_textcnn, self.word2idx_path, self.label2idx_path)
+        elif model == 'beyas':
+            self.beyas_inference(words, model_clf, model_tf)
+        elif model == 'all':
+            self.bert_inference(words, model_bert)
+            self.textcnn_inference(words, self.max_length, model_textcnn, self.word2idx_path, self.label2idx_path)
+            self.beyas_inference(words, model_clf, model_tf)
+
+    def bert_inference(self, words, model_path):
+        print("******** bert_result ********")
+        inference = bertinference('cluener')
+        model = BertForTokenClassification.from_pretrained(model_path)
+        model.to('cpu')
+        result = inference.predict(model=model, text=words)
+        print(result)
+
+    def textcnn_inference(self, words, max_length, model_path, word2idx_path, label2idx_path):
+        print("******** textcnn_result ********")
+        tokenizer = jieba.lcut
+        inference = InferenceTextCNN(model_path=model_path, word2idx_path=word2idx_path,
+                                     label2idx_path=label2idx_path, max_length=max_length, tokenizer=tokenizer)
+        print(inference([words], return_label=True))
+
+    def beyas_inference(self, words, clf_model, tf_model):
+        print("******** beyas_result ********")
+        # the bare name resolves to the imported beyas_inference class, not this method
+        inference = beyas_inference(clf_model, tf_model)
+        MODEL, TF = inference.load_model()
+        result = inference.predict(words, MODEL, TF)
+        print(result)
+
+
+if __name__ == '__main__':
+    sentence = '我很开心'
+    pipe = ClassificationPipeline(type='inference')
+    pipe.inference(model='all', input=sentence)
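Finally, a usage sketch for ClassificationPipeline, mirroring its __main__ block; the model names come from class_model_list, and the 'train' pass assumes the default data paths set in __init__ exist locally.

from knlp.seq_labeling.classification.pipeline import ClassificationPipeline

# Train a single backend (or pass model='all' to train every supported classifier)
pipe = ClassificationPipeline(type='train')
pipe.train(model='beyas')

# Run every supported classifier on one sentence; each branch prints its own prediction
pipe = ClassificationPipeline(type='inference')
pipe.inference(model='all', input='我很开心')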