Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/quality.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.6
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.6
python-version: 3.8
- name: Install Python dependencies
run: pip install black
- name: Run Quality check
run: make quality
run: make quality
15 changes: 14 additions & 1 deletion subtitles/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,17 @@ Some languages like Simplified Chinese have a different YouTube language code (`
python utils/generate_subtitles.py --language zh-CN --youtube_language_code zh-Hans
```

Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files.
Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files.

# How to convert bilingual subtitles to monolingual subtitles

# Logic

By convention, the English caption is the last line of each subtitle block in a bilingual `.srt` file, so removing the last line of every subtitle block turns a bilingual subtitle file into a monolingual one.

# Usage
> python3 convert_bilingual_monolingual.py -i \<input_file\> -o \<output_file\>

**Example**
*For instance, if the input file is named "test.cn.en.srt" and you want the output file to be named "output_test.cn.srt":*
> python3 convert_bilingual_monolingual.py -i test.cn.en.srt -o output_test.cn.srt
61 changes: 61 additions & 0 deletions utils/convert_bilingual_monolingual.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/python3
import getopt
import re
import sys

# Matches the timing line of an SRT subtitle block, e.g.
# "00:00:01,000 --> 00:00:02,500". The pattern is anchored at the start of the
# line only; anything following the second timestamp is not checked.
PATTERN_TIMESTAMP = re.compile('^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]')
# Matches a run of one or more digits.
# NOTE(review): not referenced by the functions visible in this file --
# presumably intended for the numeric index line of a block; confirm before removing.
PATTERN_NUM = re.compile('\\d+')


def main(argv):
    """Parse command-line options and run the bilingual-to-monolingual conversion.

    Recognised options are ``-h`` (print usage), ``-i``/``--ifile <inputfile>``
    and ``-o``/``--ofile <outputfile>``. When both files are given, the work is
    delegated to ``process``.

    Args:
        argv: Command-line arguments without the program name
            (typically ``sys.argv[1:]``).

    Raises:
        SystemExit: With status 0 after printing the help text, and with
            status 2 when the options are invalid or the input/output file
            is missing.
    """
    usage = 'Usage: convert_bilingual_monolingual.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, _ = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        # Report the real script name so the hint can be copy-pasted.
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            # Asking for help is not an error, so exit successfully.
            sys.exit(0)
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # A missing mandatory option is a usage error: report it and exit with a
    # non-zero status so shell scripts can detect the failure.
    if not inputfile:
        print('no input file is specified.\n' + usage)
        sys.exit(2)
    if not outputfile:
        print('no output file is specified.\n' + usage)
        sys.exit(2)
    process(inputfile, outputfile)


def process(input_file, output, encoding='utf-8'):
    """Convert a bilingual SRT caption file to a monolingual one.

    Each subtitle block is expected to consist of an index line, a timing
    line, and one or more caption lines, with blocks separated by blank
    lines. The index line, the timing line and the first caption line of
    every block are copied to the output; any further caption lines in the
    block are dropped. Blank separator lines are copied through unchanged.

    Args:
        input_file: Path of the bilingual ``.srt`` file to read.
        output: Path of the monolingual ``.srt`` file to write. An existing
            file is overwritten, so running the conversion twice yields the
            same result as running it once.
        encoding: Text encoding used for both files. Defaults to UTF-8 so
            that non-ASCII captions do not depend on the platform locale.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Position inside the current block: 0 = expecting the index line,
    # 1 = index seen, 2 = timing line seen (next caption line is kept),
    # >2 = additional caption lines, which are skipped.
    line_count = 0
    with open(input_file, encoding=encoding) as source, \
            open(output, 'w', encoding=encoding) as target:
        for line in source:
            if not line.strip():
                # Block separator. Checked first so that repeated or leading
                # blank lines never get mistaken for a block's index line.
                line_count = 0
                target.write(line)
            elif line_count == 0:
                line_count = 1
                target.write(line)
            elif PATTERN_TIMESTAMP.match(line):
                line_count += 1
                target.write(line)
            else:
                if line_count == 2:
                    target.write(line)
                line_count += 1
    print('conversion completed!')


# Run the converter only when this file is executed as a script; the program
# name (sys.argv[0]) is stripped so main() receives just the options.
if __name__ == "__main__":
    main(sys.argv[1:])