1,191 changes: 1,191 additions & 0 deletions bert_chatbot/bert_chatbot.py

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions bert_chatbot/bert_chatbot_config.py
@@ -0,0 +1,27 @@
BERT_CONFIG={
'bert_model':'./bert_chatbot',
'do_lower_case':True,
'do_predict':True,
'do_train':True,
'doc_stride':128,
'fp16':False,
'gradient_accumulation_steps':1,
'learning_rate':3e-05,
'local_rank':-1,
'loss_scale':128,
'max_answer_length':30,
'max_query_length':64,
'max_seq_length':384,
'n_best_size':20,
'no_cuda':False,
'num_train_epochs':2.0,
'optimize_on_cpu':False,
'output_dir':'./output_chatbot',
'predict_batch_size':8,
'predict_file':'./bert_chatbot/kor_dev.json',
'train_batch_size':4,
'train_file':'./bert_chatbot/train-v1.1.json',
'verbose_logging':False,
'warmup_proportion':0.1,
'seed':0
}
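

# A minimal sketch (not part of the original PR) of how BERT_CONFIG might be
# consumed: the keys mirror the command-line flags of a run_squad-style BERT QA
# script, so the dict can be exposed as an attribute-style object. The `args`
# name below is purely illustrative.
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(**BERT_CONFIG)
    print(args.bert_model, args.max_seq_length, args.train_file)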
19 changes: 19 additions & 0 deletions bert_chatbot/bert_config.json
@@ -0,0 +1,19 @@
{
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"type_vocab_size": 2,
"vocab_size": 105879
}
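
# A small companion sketch (not part of bert_config.json or the PR itself):
# the JSON above is a standard BERT-Base-sized model config (12 layers, hidden
# size 768, multilingual-sized vocabulary) and can be inspected with the stdlib.
# The path below assumes the repository layout used elsewhere in this PR.
import json

with open('./bert_chatbot/bert_config.json') as f:
    bert_cfg = json.load(f)
print(bert_cfg['num_hidden_layers'], bert_cfg['hidden_size'], bert_cfg['vocab_size'])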
1 change: 1 addition & 0 deletions bert_chatbot/dev-v1.1.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions bert_chatbot/dev.json
1 change: 1 addition & 0 deletions bert_chatbot/kor_dev.json

Large diffs are not rendered by default.

167 changes: 167 additions & 0 deletions bert_chatbot/tf_idf.py
@@ -0,0 +1,167 @@
import os
import json
import numpy as np


class ContextFinder:

    # BASE_PATH : project path
    # vectorize : TfidfVectorizer instance used to build the TF-IDF matrix
    # documents : contexts of the selected title (one entry per paragraph)
    # X         : TF-IDF weight matrix produced by fitting the documents
    # features  : feature names, i.e. the learned vocabulary
def __init__(self, title: str):
self.title = title
        self.BASE_PATH = os.path.dirname(os.path.abspath(__file__))  # directory of this module
self.vectorize = self.init_tf_idf_vector()
self.documents = []
self.X = None
self.features = None

    # use the spaCy tokenizer to normalize words
    # en_core_web_sm is spaCy's small English model
    # returns the lemmas of the tokens, e.g. "He likes playing soccer" -> ['he', 'like', 'play', 'soccer']
@staticmethod
def convert_to_lemma(text: str) -> list:
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp(text)
return [token.lemma_ for token in doc]

    # retrieve only the paragraphs of the selected title from the SQuAD 1.1 dev set
    def load_context_by_title(self, dataset_path):
        selected_title = None
        if dataset_path is None:
            dataset_path = 'dev-v1.1.json'
        with open(dataset_path) as f:
            data = json.load(f)['data']
        for article in data:
            if article.get('title') == self.title:
                selected_title = article.get('paragraphs')
                break
        if selected_title is None:
            raise ValueError(f"title '{self.title}' not found in {dataset_path}")
        for paragraph in selected_title:
            self.documents.append(paragraph.get('context'))

    # initialize the vectorizer, plugging in the custom tokenizer above (convert_to_lemma)
@staticmethod
def init_tf_idf_vector():
from sklearn.feature_extraction.text import TfidfVectorizer
return TfidfVectorizer(
tokenizer=ContextFinder.convert_to_lemma,
min_df=1,
sublinear_tf=True
)


def generate_tf_idf_vector(self):
self.X = self.vectorize.fit_transform(self.documents)
self.features = self.vectorize.get_feature_names()
        # e.g. after fitting 5 documents with 7 features, the matrix X looks like:
# ([[0. , 0.40824829, 0.81649658, 0. , 0. , 0. , 0.40824829],
# [0. , 0.40824829, 0.40824829, 0. , 0. , 0. , 0.81649658],
# [0.41680418, 0. , 0. , 0.69197025, 0.41680418, 0.41680418, 0. ],
# [0.76944707, 0. , 0. , 0.63871058, 0. , 0. , 0. ],
# [0. , 0. , 0. , 0.8695635 , 0.34918428, 0.34918428, 0. ]])

def build_model(self, dataset_path=None):
self.load_context_by_title(dataset_path)
self.generate_tf_idf_vector()

    def get_ntop_context(self, query: str, n: int):
        if self.X is None or self.features is None:
            self.build_model()

        # keep only the query keywords that appear in the vocabulary (features)
        keywords = [word for word in ContextFinder.convert_to_lemma(query) if word in self.features]

        # select the TF-IDF columns of the matched keywords for every document
        matched_keywords = np.asarray(self.X.toarray())[:, [self.vectorize.vocabulary_.get(i) for i in keywords]]
        #        word 1    word 2
        # 0    0.000000  0.000000   doc 1
        # 1    0.000000  0.000000   doc 2
        # 2    0.416804  0.691970   doc 3
        # 3    0.769447  0.638711   doc 4
        # 4    0.000000  0.869563   doc 5

        # sum the keyword weights per document, then rank documents in descending order of score
        weights = matched_keywords.sum(axis=1)
        ranked_idx = weights.argsort()[::-1]
        for i in ranked_idx[:n]:
            if weights[i] > 0:
                yield self.documents[i]

def get_ntop_context_by_cosine_similarity(self, query: str, n: int):
from sklearn.metrics.pairwise import linear_kernel
if self.X is None or self.features is None:
self.build_model()
query_vector = self.vectorize.transform([query])

        # linear_kernel computes the dot product between the query vector and every document
        # vector; since TfidfVectorizer L2-normalizes its rows, this equals cosine similarity.
        # flatten() turns the (1, n_documents) result into a 1-D array.
cosine_similar = linear_kernel(query_vector, self.X).flatten()
ranked_idx = cosine_similar.argsort()[::-1]
for i in ranked_idx[:n]:
if cosine_similar[i] > 0:
yield self.documents[i]


## usage examples
if __name__ == "__main__":
# dev set articles
'''
Super_Bowl_50
Warsaw
Normans
Nikola_Tesla
Computational_complexity_theory
Teacher
Martin_Luther
Southern_California
Sky_(United_Kingdom)
Victoria_(Australia)
Huguenot
Steam_engine
Oxygen
1973_oil_crisis
Apollo_program
European_Union_law
Amazon_rainforest
Ctenophora
Fresno,_California
Packet_switching
Black_Death
Geology
Newcastle_upon_Tyne
Victoria_and_Albert_Museum
American_Broadcasting_Company
Genghis_Khan
Pharmacy
Immune_system
    Civil_disobedience
Construction
Private_school
Harvard_University
Jacksonville,_Florida
Economic_inequality
Doctor_Who
University_of_Chicago
Yuan_dynasty
Kenya
Intergovernmental_Panel_on_Climate_Change
Chloroplast
Prime_number
Rhine
Scottish_Parliament
Islamism
Imperialism
United_Methodist_Church
French_and_Indian_War
Force
'''

c = ContextFinder('Doctor_Who')
c.build_model('./bert_chatbot/dev-v1.1.json')
for i in c.get_ntop_context('what is doctor who?', 5):
print(i)

for i in c.get_ntop_context_by_cosine_similarity('what is doctor who?', 5):
print(i)
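
    # An illustrative sketch (not part of the original PR): TfidfVectorizer
    # L2-normalizes its rows by default, so linear_kernel (a plain dot product)
    # on those rows already equals cosine similarity, which is why
    # get_ntop_context_by_cosine_similarity can use linear_kernel directly.
    # The toy corpus and query below are made up purely for this demo.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

    toy_docs = [
        "the doctor travels in the tardis",
        "the tardis is bigger on the inside",
        "daleks are the oldest enemy of the doctor",
    ]
    toy_vec = TfidfVectorizer(sublinear_tf=True)
    toy_X = toy_vec.fit_transform(toy_docs)
    toy_q = toy_vec.transform(["who is the doctor"])

    dot = linear_kernel(toy_q, toy_X).flatten()
    cos = cosine_similarity(toy_q, toy_X).flatten()
    print(np.allclose(dot, cos))   # True: dot product equals cosine on L2-normalized rows
    print(dot.argsort()[::-1])     # document indices ranked by similarity to the query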
114 changes: 114 additions & 0 deletions bert_chatbot/tf_idf_ko.py
@@ -0,0 +1,114 @@
import os
import json
import numpy as np
from konlpy.tag import Okt


class ContextFinder:

    # BASE_PATH : project path
    # vectorize : TfidfVectorizer instance used to build the TF-IDF matrix
    # documents : paragraph contexts collected from every article
    # X         : TF-IDF weight matrix produced by fitting the documents
    # features  : feature names, i.e. the learned vocabulary
    # tokenizer : Open-Korean-Text (Okt) tagger for Korean morphological analysis
def __init__(self):
        self.BASE_PATH = os.path.dirname(os.path.abspath(__file__))  # directory of this module
self.vectorize = self.init_tf_idf_vector()
self.documents = []
self.X = None
self.features = None
self.tokenizer = Okt()

    # tokenization with normalization and stemming
    # norm : collapse repeated characters, e.g. ㅋㅋㅋㅋㅋ -> ㅋㅋ
    # stem : reduce to the dictionary form, e.g. 들어간다 -> 들어가다 ("goes" -> "to go")
def convert_to_lemma(self, text: str) -> list:
return self.tokenizer.morphs(text, norm=True, stem=True)

    # for testing: return the (token, POS tag) tuples of the input text
def check_pos(self, text: str) -> list:
return self.tokenizer.pos(text)


    # load every paragraph context from the dev data
def load_context_by_title(self, dataset_path):
if dataset_path is None:
dataset_path = 'dev-v1.1.json'
with open(dataset_path) as f:
data = json.load(f)['data']
for article in data:
for paragraph in article.get('paragraphs'):
self.documents.append(paragraph.get('context'))

    # initialize the vectorizer, plugging in the custom tokenizer above (convert_to_lemma)
def init_tf_idf_vector(self):
from sklearn.feature_extraction.text import TfidfVectorizer
return TfidfVectorizer(
tokenizer=self.convert_to_lemma,
min_df=1,
sublinear_tf=True
)

def generate_tf_idf_vector(self):
self.X = self.vectorize.fit_transform(self.documents)
self.features = self.vectorize.get_feature_names()
        # e.g. after fitting 5 documents with 7 features, the matrix X looks like:
# ([[0. , 0.40824829, 0.81649658, 0. , 0. , 0. , 0.40824829],
# [0. , 0.40824829, 0.40824829, 0. , 0. , 0. , 0.81649658],
# [0.41680418, 0. , 0. , 0.69197025, 0.41680418, 0.41680418, 0. ],
# [0.76944707, 0. , 0. , 0.63871058, 0. , 0. , 0. ],
# [0. , 0. , 0. , 0.8695635 , 0.34918428, 0.34918428, 0. ]])

def build_model(self, dataset_path=None):
self.load_context_by_title(dataset_path)
self.generate_tf_idf_vector()

    def get_ntop_context(self, query: str, n: int):
        if self.X is None or self.features is None:
            self.build_model()

        # keep only the query keywords that appear in the vocabulary (features)
        keywords = [word for word in self.convert_to_lemma(query) if word in self.features]

        # select the TF-IDF columns of the matched keywords for every document
        matched_keywords = np.asarray(self.X.toarray())[:, [self.vectorize.vocabulary_.get(i) for i in keywords]]
        #        word 1    word 2
        # 0    0.000000  0.000000   doc 1
        # 1    0.000000  0.000000   doc 2
        # 2    0.416804  0.691970   doc 3
        # 3    0.769447  0.638711   doc 4
        # 4    0.000000  0.869563   doc 5

        # sum the keyword weights per document, then rank documents in descending order of score
        weights = matched_keywords.sum(axis=1)
        ranked_idx = weights.argsort()[::-1]
        for i in ranked_idx[:n]:
            if weights[i] > 0:
                yield self.documents[i]

def get_ntop_context_by_cosine_similarity(self, query: str, n: int):
from sklearn.metrics.pairwise import linear_kernel
if self.X is None or self.features is None:
self.build_model()
query_vector = self.vectorize.transform([query])

        # linear_kernel computes the dot product between the query vector and every document
        # vector; since TfidfVectorizer L2-normalizes its rows, this equals cosine similarity.
        # flatten() turns the (1, n_documents) result into a 1-D array.
cosine_similar = linear_kernel(query_vector, self.X).flatten()
ranked_idx = cosine_similar.argsort()[::-1]
for i in ranked_idx[:n]:
if cosine_similar[i] > 0:
yield self.documents[i]


## usage examples
if __name__ == "__main__":

    # quick tokenizer check
    c = ContextFinder()
    # "What is the second-highest position in the US military?"
    print(c.convert_to_lemma("미국 군대 내 두번째로 높은 직위는 무엇인가?"))
# c.build_model('./bert_chatbot/dev-v1.1.json')
# for i in c.get_ntop_context('what is doctor who?', 5):
# print(i)
#
# for i in c.get_ntop_context_by_cosine_similarity('what is doctor who?', 5):
# print(i)
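
    # A hedged usage sketch (not part of the original PR): build the TF-IDF model on
    # the Korean dev file shipped with this PR and rank contexts for the question
    # tested above. This assumes ./bert_chatbot/kor_dev.json follows the same
    # SQuAD-style {"data": [{"paragraphs": [{"context": ...}]}]} layout as dev-v1.1.json.
    finder = ContextFinder()
    finder.build_model('./bert_chatbot/kor_dev.json')
    for ctx in finder.get_ntop_context_by_cosine_similarity('미국 군대 내 두번째로 높은 직위는 무엇인가?', 3):
        print(ctx)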
1 change: 1 addition & 0 deletions bert_chatbot/train-v1.1.json

Large diffs are not rendered by default.
