class Parse:
    """Universal Dependencies parser using a "goeswith"-style model.

    Wraps a Hugging Face token-classification model whose labels encode
    "UPOS|FEATS|DEPREL" triples (e.g. "X|_|goeswith"); head selection is
    decoded with the Chu-Liu/Edmonds maximum-spanning-tree algorithm.
    """

    def __init__(
        self, model: Union[str, None] = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
    ) -> None:
        """Load tokenizer and token-classification model.

        :param model: Hugging Face model name or local path; ``None``
            falls back to the default Thai "goeswith" model.
        """
        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-goeswith"
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def __call__(
        self,
        text: str, tag: str = "str"
    ) -> Union[List[List[str]], str]:
        """Parse *text* into CoNLL-U style dependency rows.

        :param text: raw input text.
        :param tag: ``"list"`` returns the rows as a list of ten-column
            lists; any other value returns one tab-separated string.
        :return: rows of (ID, FORM, "_", UPOS, "_", FEATS, HEAD, DEPREL,
            "_", MISC), either as ``List[List[str]]`` or as a single
            CoNLL-U-like string terminated by a blank line.
        """
        w = self.tokenizer(text, return_offsets_mapping=True)
        v = w["input_ids"]
        # Build one model input per real token: position i is replaced
        # by [MASK] and the original token id j is appended at the end,
        # so each batch row scores one dependent against all heads.
        x = [
            v[0:i] + [self.tokenizer.mask_token_id] + v[i + 1:] + [j]
            for i, j in enumerate(v[1:-1], 1)
        ]
        with torch.no_grad():
            # Slice off the leading special token and the two trailing
            # positions (appended token id + final special token).
            e = self.model(input_ids=torch.tensor(x)
                           ).logits.numpy()[:, 1:-2, :]
        # Per-label mask seed: +1 for label id 0, -1 for "...|root"
        # labels, 0 for everything else.
        r = [
            1 if i == 0 else -1
            if j.endswith("|root") else 0
            for i, j in sorted(self.model.config.id2label.items())
        ]
        # NaN out invalid (dependent, head, label) cells: on the
        # diagonal (identity == 1) only "|root" labels survive; off the
        # diagonal only non-root labels survive; label id 0 is NaN'd
        # everywhere.
        e += np.where(np.add.outer(np.identity(e.shape[0]), r) == 0, 0, np.nan)
        g = self.model.config.label2id["X|_|goeswith"]
        # Restrict "goeswith": permitted (r == 0) only strictly to the
        # right of the head, and only while the running argmax keeps
        # extending an unbroken goeswith chain.
        r = np.tri(e.shape[0])
        for i in range(e.shape[0]):
            for j in range(i + 2, e.shape[1]):
                r[i, j] = r[i, j - 1] if np.nanargmax(e[i, j - 1]) == g else 1
        e[:, :, g] += np.where(r == 0, 0, np.nan)
        # m[dep, head]: best label score per arc, with an extra row and
        # column 0 for the artificial root node; p holds the argmax
        # label ids in the same layout.
        m = np.full((e.shape[0] + 1, e.shape[1] + 1), np.nan)
        m[1:, 1:] = np.nanmax(e, axis=2).transpose()
        p = np.zeros(m.shape)
        p[1:, 1:] = np.nanargmax(e, axis=2).transpose()
        # Move the diagonal (token headed by itself = root attachment)
        # into column 0, and forbid the self-loop itself.
        for i in range(1, m.shape[0]):
            m[i, 0], m[i, i], p[i, 0] = m[i, i], np.nan, p[i, i]
        h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        # Enforce a single root: if several tokens attached to node 0,
        # keep only the best-scoring root candidate (NaN out the other
        # candidates' non-root arcs) and re-run the MST decoder.
        if [0 for i in h if i == 0] != [0]:
            m[:, 0] += np.where(
                m[:, 0] == np.nanmax(
                    m[[i for i, j in enumerate(h) if j == 0], 0]), 0, np.nan
            )
            m[[i for i, j in enumerate(h) if j == 0]] += [
                0 if i == 0 or j == 0 else np.nan for i, j in enumerate(h)
            ]
            h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        u = ""
        # NOTE: `e` is rebound below from the logits array to character
        # end-offsets; keep only non-empty token spans.
        v = [(s, e) for s, e in w["offset_mapping"] if s < e]
        if tag == "list":
            _tag_data = []
            for i, (s, e) in enumerate(v, 1):
                # p[i, h[i]] is a numpy float; the id2label dict lookup
                # still works because hash(5.0) == hash(5).
                q = self.model.config.id2label[p[i, h[i]]].split("|")
                _tag_data.append(
                    [
                        str(i),
                        text[s:e],
                        "_",
                        q[0],
                        "_",
                        "|".join(q[1:-1]),
                        str(h[i]),
                        q[-1],
                        "_",
                        # MISC: "_" when a gap (space) follows this
                        # token, otherwise "SpaceAfter=No".
                        "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"
                    ]
                )
            return _tag_data
        else:
            for i, (s, e) in enumerate(v, 1):
                q = self.model.config.id2label[p[i, h[i]]].split("|")
                u += "\t".join([str(i),
                                text[s:e],
                                "_",
                                q[0],
                                "_",
                                "|".join(q[1:-1]),
                                str(h[i]),
                                q[-1],
                                "_",
                                "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"]) + "\n"
            return u + "\n"
0 commit comments