Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 52 additions & 95 deletions promptsource/machine_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,96 +3,47 @@

from promptsource.templates import Template, TemplateCollection


### XNLI

PROMPTS = [
"GPT-3 style",
"can we infer",
"justified in saying",
"guaranteed/possible/impossible",
"MNLI crowdsource",
]

LANGS = [
"ar",
"es",
"fr",
"hi",
"sw",
"ur",
"vi",
"zh",
]

SOURCE_DATASET = TARGET_DATASET = "xnli"
SOURCE_LANG = "en"


### XCOPA

PROMPTS = [
"best_option",
'C1 or C2? premise, so/because…',
"i_am_hesitating",
"cause_effect",
"plausible_alternatives",
]

LANGS = [
"id",
"sw",
"ta",
"vi",
"zh",
]

SOURCE_DATASET = "super_glue/copa"
SOURCE_LANG = None
TARGET_DATASET = "xcopa"

### XSTORY_CLOZE

PROMPTS = [
"Answer Given options",
'Choose Story Ending',
"Story Continuation and Options",
"Generate Ending",
"Novel Correct Ending",
]

LANGS = [
"ar",
"es",
"eu",
"hi",
"id",
"zh",
DS_TO_ENG_PROMPT = {
"xcopa": "en",
"Muennighoff/xstory_cloze": "en",
"Muennighoff/xwinograd": "en",
'GEM/wiki_lingua': 'en_en', # Contains correct language names
'xnli': 'en',
"paws-x": "en",
"mlqa": "mlqa.en.en",
"xquad": "xquad.en",
"khalidalt/tydiqa-primary": "english",
"khalidalt/tydiqa-goldp": "english",
"pasinit/xlwic": "en",
"GEM/xlsum": "english",
"GEM/BiSECT": "en",
}

### ZH Datasets

DATASETS = [
('xquad', 'xquad.zh'),
# Context & Answer is in ZH
('mlqa', 'mlqa.zh.ar'),
('mlqa', 'mlqa.zh.vi'),
('mlqa', 'mlqa.zh.es'),
('mlqa', 'mlqa.zh.en'),
('mlqa', 'mlqa.zh.hi'),
('paws-x', 'zh'),
('clue', 'c3'),
('clue', 'cmrc2018'),
('clue', 'csl'),
('clue', 'drcd'),
('clue', 'tnews'),
('pasinit/xlwic', "xlwic_en_zh"),
('GEM/xlsum', "chinese_simplified"),
# ('GEM/xlsum', "chinese_traditional"),
# For WikiLingua there are already ZH prompts (except for xp3long prompts)
("xquad", )
]

SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xstory_cloze"
SOURCE_LANG = "en"

### XWINOGRAD

PROMPTS = [
"Replace",
"stand for",
"True or False",
"does underscore refer to",
"underscore refer to",
]

LANGS = [
"fr",
"pt",
"zh",
]

SOURCE_DATASET = TARGET_DATASET = "Muennighoff/xwinograd"
SOURCE_LANG = "en"


LANG = "zh"

# Path to key
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/niklasmuennighoff/Desktop/gcp_translate_key.json"
Expand Down Expand Up @@ -147,22 +98,28 @@ def normalize_string(zh_string, en_string):


template_collection = TemplateCollection()
source_templates = template_collection.get_dataset(SOURCE_DATASET, SOURCE_LANG)

for lang in LANGS:
target_templates = template_collection.get_dataset(TARGET_DATASET, lang)
for (ds_name, subset_name) in DATASETS:

subset_name_eng = subset_name
if ds_name in DS_TO_ENG_PROMPT:
subset_name_eng = DS_TO_ENG_PROMPT[ds_name]

source_templates = template_collection.get_dataset(ds_name, subset_name_eng)
#for lang in LANGS:
target_templates = template_collection.get_dataset(ds_name, subset_name)
for uid, template in source_templates.templates.items():
if template.name.strip() not in PROMPTS:
if not("xp3long" in template.name.strip()):# not in PROMPTS:
continue
print(f"Translating {template.name.strip()} to {lang}")
print(f"Translating {template.name.strip()} to {LANG}")
answer_choices = []
if template.answer_choices is not None:
choices = template.answer_choices.split("|||")
for c in choices:
answer_choices.append(normalize_string(translate(lang, c.strip()), c.strip()))
answer_choices.append(normalize_string(translate(LANG, c.strip()), c.strip()))
or_jinja = template.jinja.strip()
jinja = normalize_string(translate(lang, or_jinja), or_jinja)
template_name = template.name.strip() + f"_{lang}mt"
jinja = normalize_string(translate(LANG, or_jinja), or_jinja)
template_name = template.name.strip() + f"_{LANG}mt"
target_template = Template(
template_name, jinja=jinja, reference="", answer_choices=" ||| ".join(answer_choices)
)
Expand Down
1 change: 1 addition & 0 deletions promptsource/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,7 @@ def read_from_file(self) -> Dict:
"Please ignore this warning if you are creating new prompts for this dataset."
)
return {}
print(self.dataset_name, self.yaml_path)
yaml_dict = yaml.load(open(self.yaml_path, "r"), Loader=yaml.FullLoader)
return yaml_dict[self.TEMPLATES_KEY]

Expand Down
27 changes: 27 additions & 0 deletions promptsource/templates/GEM/wiki_lingua/zh/templates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,30 @@ templates:
original_task: true
name: summarize_above_zh
reference: xsum templates
dfa7b514-7385-4855-bb90-253073a34fde: !Template
answer_choices: null
id: dfa7b514-7385-4855-bb90-253073a34fde
jinja: "{{target}}\n\n鉴于上述总结,为它写一个详细的文本。||| {{source}}"
metadata: !TemplateMetadata
choices_in_prompt: false
languages: []
metrics:
- ROUGE
- BLEU
original_task: true
name: xp3longwritearticle_zhmt
reference: ''
dff8b414-7485-4855-bb90-253073a34fde: !Template
answer_choices: null
id: dff8b414-7485-4855-bb90-253073a34fde
jinja: "{{target}}\n\n我对此很感兴趣,但我只有几分钟的时间。
你能不能给我最多前500个字符的详细解释关于那个? ||| {{source[:500]}}"
metadata: !TemplateMetadata
choices_in_prompt: false
languages: []
metrics:
- ROUGE
- BLEU
original_task: true
name: xp3longchars_zhmt
reference: ''
115 changes: 115 additions & 0 deletions promptsource/templates/GEM/xlsum/chinese_simplified/templates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
dataset: GEM/xlsum
subset: chinese_simplified
templates:
0bf9d87c-c6ac-4ff3-a956-e57ec434c4df: !Template
answer_choices: ''
id: 0bf9d87c-c6ac-4ff3-a956-e57ec434c4df
jinja: 标题:{{title}}\n给定上面虚构文章的标题,想象这篇文章。\n ||| {{text[:7000]}}
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: imaginearticle_zhmt
reference: ''
70151209-fdd4-4b78-988d-629ff5da1239: !Template
answer_choices: ''
id: 70151209-fdd4-4b78-988d-629ff5da1239
jinja: '{{text[:1000]}}... 继续这篇文章,最多 4000 个字符:||| {{text[1000:5000]}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: xp3longcontinue_zhmt
reference: ''
900008a7-0cde-4f0d-8fd3-2ff9cb5b36f1: !Template
answer_choices: ''
id: 900008a7-0cde-4f0d-8fd3-2ff9cb5b36f1
jinja: 给定以下文章的标题和摘要,生成一篇短篇文章或一篇长篇文章的开头以配合它们。标题:{{title}}\n摘要:{{target}}\n文章(最多
500 个字符):||| {{text[:500]}}
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: xp3longgenarticle_zhmt
reference: ''
9387daa0-a014-4df0-9d21-ff086dc5df4a: !Template
answer_choices: ''
id: 9387daa0-a014-4df0-9d21-ff086dc5df4a
jinja: 要总结的文档:{{text[:8500]}}\n以与文档相同的语言进行总结:||| {{target}}
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: docsummary_zhmt
reference: ''
97e1cdb0-1c5d-4421-a15b-faf02cd812a8: !Template
answer_choices: ''
id: 97e1cdb0-1c5d-4421-a15b-faf02cd812a8
jinja: '{{text}} \n\n给我上面文章的好标题。 ||| {{title}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: goodtitle_zhmt
reference: ''
9d46d594-7463-4f74-bfd9-b5cbf804e94b: !Template
answer_choices: ''
id: 9d46d594-7463-4f74-bfd9-b5cbf804e94b
jinja: 标题:{{title}}\n给定上面虚构文章的标题,想象这篇文章。\n ||| {{text[:7000]}}
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: xp3longimaginearticle_zhmt
reference: ''
bfd429f5-d75d-46a2-8224-874d1109c310: !Template
answer_choices: ''
id: bfd429f5-d75d-46a2-8224-874d1109c310
jinja: '{{title}}\n{{text[:5000]}}\n\ntl;博士:||| {{target}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: tldr_zhmt
reference: ''
d4f2980a-cb44-4e47-b6c3-3df858c82126: !Template
answer_choices: ''
id: d4f2980a-cb44-4e47-b6c3-3df858c82126
jinja: 给定以下文章的标题和摘要,生成一篇短篇文章或一篇长篇文章的开头以配合它们。标题:{{title}}\n摘要:{{target}}\n文章(最多
500 个字符):||| {{text[:500]}}
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: genarticle_zhmt
reference: ''
dca03c69-940f-46b0-acd8-16bbcbd6bec9: !Template
answer_choices: ''
id: dca03c69-940f-46b0-acd8-16bbcbd6bec9
jinja: '...{{text[3000:3500]}}... 写下文章的其余部分:||| {{text[5000:]}}'
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: xp3longrest_zhmt
reference: ''
ff7e01a6-9c23-4228-85c8-b84432deeb50: !Template
answer_choices: ''
id: ff7e01a6-9c23-4228-85c8-b84432deeb50
jinja: 内容:{{text[:7000]}}\n前面的内容可以总结如下:||| {{target}}
metadata: !TemplateMetadata
choices_in_prompt: null
languages: null
metrics: null
original_task: null
name: prevcontent_zhmt
reference: ''
Loading