From 6b1e3155a83842afb1a0e750e1a45e316e6d7e00 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 5 Dec 2022 23:13:23 +0800 Subject: [PATCH 1/4] created script created script for converting bilingual caption to monolingual caption --- utils/convert_bilingual_monolingual.py | 61 ++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 utils/convert_bilingual_monolingual.py diff --git a/utils/convert_bilingual_monolingual.py b/utils/convert_bilingual_monolingual.py new file mode 100644 index 000000000..4a8004cdb --- /dev/null +++ b/utils/convert_bilingual_monolingual.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +import getopt +import re +import sys + +PATTERN_TIMESTAMP = re.compile('^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]') +PATTERN_NUM = re.compile('\\d+') + + +def main(argv): + inputfile = '' + outputfile = '' + try: + opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="]) + except getopt.GetoptError: + print('srt_worker.py -i -o ') + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print( 'Usage: convert_bilingual_monolingual.py -i -o ') + sys.exit(-2) + elif opt in ("-i", "--ifile"): + inputfile = arg + elif opt in ("-o", "--ofile"): + outputfile = arg + + if not inputfile: + print('no input file is specified.\nUsage: convert_bilingual_monolingual.py -i -o ') + elif not outputfile: + print('no output file is specified.\nUsage: convert_bilingual_monolingual.py -i -o ') + else: + process(inputfile, outputfile) + + +def process(input_file, output): + """ + Convert bilingual caption file to monolingual caption, supported caption file type is srt. 
+ """ + line_count = 0 + with open(input_file) as file: + with open(output, 'a') as output: + for line in file: + if line_count == 0: + line_count += 1 + output.write(line) + elif PATTERN_TIMESTAMP.match(line): + line_count += 1 + output.write(line) + elif line == '\n': + line_count = 0 + output.write(line) + else: + if line_count == 2: + output.write(line) + line_count += 1 + output.close() + print('conversion completed!') + + +if __name__ == "__main__": + main(sys.argv[1:]) From 5393ada04f48b2dfdb954c13f9ed09d05c767294 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 5 Dec 2022 23:16:22 +0800 Subject: [PATCH 2/4] updated quality.yml updated quality check configuration --- .github/workflows/quality.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 6b7afd288..63904d655 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -11,10 +11,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.11 uses: actions/setup-python@v2 with: - python-version: 3.6 + python-version: 3.11 - name: Install Python dependencies run: pip install black - name: Run Quality check From 80d8963905a51fee221f2f97f377c04cdd735ca0 Mon Sep 17 00:00:00 2001 From: Yuan Date: Tue, 6 Dec 2022 13:10:24 +0800 Subject: [PATCH 3/4] reverted changes on quality.yml --- .github/workflows/quality.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 63904d655..6b7afd288 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -11,10 +11,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.11 + - name: Set up Python 3.6 uses: actions/setup-python@v2 with: - python-version: 3.11 + python-version: 3.6 - name: Install Python dependencies run: pip install black - name: Run 
Quality check From 20311556d2503a6f35a83c81db526f55ebefe2f2 Mon Sep 17 00:00:00 2001 From: Yuan Date: Fri, 9 Dec 2022 00:00:39 +0800 Subject: [PATCH 4/4] merged branch:base merged quality.yml added instruction of converting bilingual subtitle to monolingual --- .github/workflows/quality.yml | 6 +++--- subtitles/README.md | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 6b7afd288..06b296e3e 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -11,11 +11,11 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.8 uses: actions/setup-python@v2 with: - python-version: 3.6 + python-version: 3.8 - name: Install Python dependencies run: pip install black - name: Run Quality check - run: make quality \ No newline at end of file + run: make quality diff --git a/subtitles/README.md b/subtitles/README.md index 071b13c63..53d87db37 100644 --- a/subtitles/README.md +++ b/subtitles/README.md @@ -26,4 +26,17 @@ Some languages like Simplified Chinese have a different YouTube language code (` python utils/generate_subtitles.py --language zh-CN --youtube_language_code zh-Hans ``` -Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files. \ No newline at end of file +Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files. + +# How to convert bilingual subtitles to monolingual subtitles + +# Logic + +In bilingual `.srt` files, the English caption is conventionally the last line of each subtitle block. The script keeps only the first caption line of each block, so for the usual two-line blocks it removes the English line and turns the bilingual subtitle into a monolingual one.
+ +# Usage +> python3 utils/convert_bilingual_monolingual.py -i \<input_file\> -o \<output_file\> + +**Example** +*For instance, if the input file is named "test.cn.en.srt" and you want the output file to be named "output_test.cn.srt":* +> python3 utils/convert_bilingual_monolingual.py -i test.cn.en.srt -o output_test.cn.srt \ No newline at end of file