diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 6b7afd288..06b296e3e 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -11,11 +11,11 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.8 uses: actions/setup-python@v2 with: - python-version: 3.6 + python-version: 3.8 - name: Install Python dependencies run: pip install black - name: Run Quality check - run: make quality \ No newline at end of file + run: make quality diff --git a/subtitles/README.md b/subtitles/README.md index 071b13c63..53d87db37 100644 --- a/subtitles/README.md +++ b/subtitles/README.md @@ -26,4 +26,17 @@ Some languages like Simplified Chinese have a different YouTube language code (` python utils/generate_subtitles.py --language zh-CN --youtube_language_code zh-Hans ``` -Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files. \ No newline at end of file +Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files. + +# How to convert bilingual subtitles to monolingual subtitles + +# Logic + +The English caption line is conventionally placed at the last line of each subtitle block in srt files, so removing the last line of each subtitle block turns a bilingual subtitle into a monolingual subtitle. 
#!/usr/bin/python3
"""Convert a bilingual ``.srt`` subtitle file into a monolingual one.

Each subtitle block of an srt file is laid out as::

    <index>
    <start> --> <end>
    <caption line 1>
    <caption line 2>
    <blank line>

The converter keeps the index, the timestamp and the first caption line of
every block and drops the remaining caption lines.  This relies on the
convention that the translated caption comes first and the English caption
is the last line of the block.

Usage:
    python3 convert_bilingual_monolingual.py -i <input_file> -o <output_file>

Example:
    python3 convert_bilingual_monolingual.py -i test.cn.en.srt -o output_test.cn.srt
"""
import getopt
import re
import sys

PATTERN_TIMESTAMP = re.compile(
    "^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> "
    "[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]"
)
PATTERN_NUM = re.compile("\\d+")

USAGE = "Usage: convert_bilingual_monolingual.py -i <inputfile> -o <outputfile>"


def main(argv):
    """Parse the command line and run the conversion.

    Args:
        argv: Command-line arguments without the program name. Recognised
            options are ``-i``/``--ifile`` (input srt file),
            ``-o``/``--ofile`` (output srt file) and ``-h`` (help).

    Raises:
        SystemExit: With status 0 after printing help, or status 2 when the
            options cannot be parsed or a required file name is missing.
    """
    inputfile = ""
    outputfile = ""
    try:
        opts, _ = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(USAGE)
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-h":
            # Asking for help is not an error, so report success.
            print(USAGE)
            sys.exit(0)
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    if not inputfile:
        print("no input file is specified.\n" + USAGE)
        sys.exit(2)
    if not outputfile:
        print("no output file is specified.\n" + USAGE)
        sys.exit(2)

    process(inputfile, outputfile)


def _keep_first_caption(lines):
    """Yield *lines* of an srt stream, keeping one caption line per block."""
    # Number of non-blank lines already consumed in the current block:
    # 0 -> expecting the index, 1 -> expecting the timestamp,
    # 2 -> expecting the first caption line, >2 -> extra caption lines.
    position = 0
    for line in lines:
        if not line.strip():
            # A blank line always ends the block. Checking it first keeps the
            # state machine in sync when blocks are separated by several
            # blank (or whitespace-only) lines.
            position = 0
            yield line
        elif position == 0 or PATTERN_TIMESTAMP.match(line):
            position += 1
            yield line
        else:
            if position == 2:
                yield line
            position += 1


def process(input_file, output, encoding="utf-8"):
    """
    Convert bilingual caption file to monolingual caption, supported caption file type is srt.

    Args:
        input_file: Path of the bilingual srt file to read.
        output: Path of the monolingual srt file to write. An existing file
            is overwritten.
        encoding: Text encoding used for both files.

    Raises:
        OSError: If a file cannot be opened, read or written.
        UnicodeError: If the input is not valid text in ``encoding``.
    """
    # Read everything before opening the output so that using the same path
    # for input and output cannot truncate the data before it is read.
    with open(input_file, encoding=encoding) as source:
        converted = list(_keep_first_caption(source))

    # "w" rather than "a": a rerun must replace the previous result instead
    # of appending a second copy to it.
    with open(output, "w", encoding=encoding) as target:
        target.writelines(converted)
    print("conversion completed!")


if __name__ == "__main__":
    main(sys.argv[1:])