105 lines
2.8 KiB
Python
105 lines
2.8 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Convert VTT or SRT subtitle files to Markdown plain text."""
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
|
||
|
|
def strip_html(text):
    """Return *text* with every ``<...>`` tag removed.

    A tag is a ``<``, followed by one or more characters that are not
    ``>``, followed by ``>``. Text outside such spans is left untouched.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)
|
||
|
|
|
||
|
|
|
||
|
|
def parse_vtt(content):
    """Extract the cue texts from WebVTT *content*.

    The text is split into blocks on blank lines. Blocks whose first line
    starts with ``WEBVTT``, ``NOTE`` or ``STYLE`` are skipped, as are
    blocks that contain no ``-->`` timing line. For every remaining block
    the lines after the timing line are stripped of tags and surrounding
    whitespace and joined with single spaces.

    Args:
        content: The full text of a ``.vtt`` file. LF, CRLF and CR line
            endings and a leading byte-order mark are all accepted.

    Returns:
        A list with one string per cue that has non-empty text, in file
        order.
    """
    # Normalise first: a leading BOM would hide the ``WEBVTT`` header, and
    # with CRLF endings "\r\n\r\n" never matches the blank-line separator,
    # so the whole file would collapse into a single (skipped) block.
    normalized = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

    cues = []
    for block in re.split(r'\n{2,}', normalized.strip()):
        lines = block.strip().splitlines()
        if not lines:
            continue
        # Header, comment and stylesheet blocks carry no cue text.
        if lines[0].startswith(('WEBVTT', 'NOTE', 'STYLE')):
            continue
        ts_idx = next((i for i, line in enumerate(lines) if '-->' in line), None)
        if ts_idx is None:
            continue
        # Only lines after the timing line are payload; anything before it
        # (such as a cue identifier) is ignored.
        parts = (strip_html(line).strip() for line in lines[ts_idx + 1:])
        text = ' '.join(part for part in parts if part)
        if text:
            cues.append(text)
    return cues
|
||
|
|
|
||
|
|
|
||
|
|
def parse_srt(content):
    """Extract the cue texts from SubRip (SRT) *content*.

    The text is split into blocks on blank lines. Within a block, timing
    lines (``HH:MM:SS,mmm -->``) and the numeric cue counter are dropped;
    the remaining lines are stripped of tags and surrounding whitespace
    and joined with single spaces.

    A digit-only line is treated as the cue counter only when it is the
    first line of its block or is immediately followed by a timing line,
    so numeric subtitle text such as ``1945`` is preserved.

    Args:
        content: The full text of a ``.srt`` file. LF, CRLF and CR line
            endings and a leading byte-order mark are all accepted.

    Returns:
        A list with one string per block that has non-empty text, in file
        order.
    """
    timestamp = re.compile(r'\d{2}:\d{2}:\d{2}[,\.]\d{3}\s*-->')
    counter = re.compile(r'\d+')

    # A leading BOM would make the first counter line non-numeric and leak
    # it into the output; CRLF endings would hide the blank-line separators.
    normalized = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

    cues = []
    for block in re.split(r'\n{2,}', normalized.strip()):
        lines = [line.strip() for line in block.strip().splitlines()]
        text_lines = []
        for pos, line in enumerate(lines):
            if timestamp.match(line):
                continue
            if counter.fullmatch(line):
                before_timing = (
                    pos + 1 < len(lines) and timestamp.match(lines[pos + 1])
                )
                if pos == 0 or before_timing:
                    continue
            # Strip again: removing tags can expose inner whitespace,
            # e.g. "<i> hi </i>".
            cleaned = strip_html(line).strip()
            if cleaned:
                text_lines.append(cleaned)
        if text_lines:
            cues.append(' '.join(text_lines))
    return cues
|
||
|
|
|
||
|
|
|
||
|
|
def deduplicate(cues):
    """Collapse runs of identical consecutive cues into a single entry.

    Only adjacent repeats are removed; a cue that reappears after a
    different cue is kept. The comparison for the first item is made
    against ``None``.
    """
    items = list(cues)
    # Pair every item with its predecessor (None for the first one).
    predecessors = [None] + items
    return [
        current
        for previous, current in zip(predecessors, items)
        if current != previous
    ]
|
||
|
|
|
||
|
|
|
||
|
|
def to_markdown(cues, sentences_per_paragraph=8):
    """Join cue texts into paragraphs separated by blank lines.

    Consecutive duplicate cues are removed, the rest are joined with
    spaces, and the result is split into sentences at whitespace that
    follows ``.``, ``!``, ``?`` or ``…``. Sentences are then grouped into
    paragraphs.

    Args:
        cues: Iterable of cue strings.
        sentences_per_paragraph: Maximum number of sentences per
            paragraph. Defaults to 8.

    Returns:
        The paragraphs joined by a blank line, or ``''`` when there are
        no cues.

    Raises:
        ValueError: If *sentences_per_paragraph* is less than 1.
    """
    if sentences_per_paragraph < 1:
        raise ValueError('sentences_per_paragraph must be at least 1')

    cues = deduplicate(cues)
    if not cues:
        return ''

    # The lookbehind keeps the closing punctuation attached to its sentence.
    sentences = re.split(r'(?<=[.!?…])\s+', ' '.join(cues))
    paragraphs = [
        ' '.join(sentences[start:start + sentences_per_paragraph])
        for start in range(0, len(sentences), sentences_per_paragraph)
    ]
    return '\n\n'.join(paragraphs)
|
||
|
|
|
||
|
|
|
||
|
|
def convert(input_path):
    """Read a subtitle file and return its text as Markdown paragraphs.

    The parser is chosen from the file extension, compared
    case-insensitively: ``.vtt`` or ``.srt``.

    Args:
        input_path: Path to the subtitle file, as a ``pathlib.Path`` or a
            string.

    Returns:
        The converted text as produced by ``to_markdown``.

    Raises:
        ValueError: If the extension is neither ``.vtt`` nor ``.srt``.
    """
    input_path = Path(input_path)
    # 'utf-8-sig' drops a leading byte-order mark, which would otherwise
    # stay in the text and end up in the first parsed line. Undecodable
    # bytes are replaced rather than raising.
    content = input_path.read_text(encoding='utf-8-sig', errors='replace')
    suffix = input_path.suffix.lower()
    if suffix == '.vtt':
        cues = parse_vtt(content)
    elif suffix == '.srt':
        cues = parse_srt(content)
    else:
        raise ValueError(f'Unsupported format: {suffix}')
    return to_markdown(cues)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Command-line entry point:
    #   script <file.vtt|file.srt> [output.md]
    # Without an explicit output path the result is written next to the
    # input file with a ``.md`` extension.
    if len(sys.argv) < 2:
        print(f'Usage: {sys.argv[0]} <file.vtt|file.srt> [output.md]', file=sys.stderr)
        sys.exit(1)

    input_path = Path(sys.argv[1])
    # is_file() also rejects directories, which exists() would let through
    # only to fail later when the path is read.
    if not input_path.is_file():
        print(f'File not found: {input_path}', file=sys.stderr)
        sys.exit(1)

    try:
        md = convert(input_path)
    except ValueError as err:
        # convert() raises ValueError for unsupported extensions; report it
        # like the other usage errors instead of showing a traceback.
        print(err, file=sys.stderr)
        sys.exit(1)

    output_path = Path(sys.argv[2]) if len(sys.argv) >= 3 else input_path.with_suffix('.md')
    output_path.write_text(md, encoding='utf-8')
    print(f'Saved: {output_path}')
|