yt-dlf/scripts/subtitle_to_markdown.py

105 lines
2.8 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""Convert VTT or SRT subtitle files to Markdown plain text."""
import re
import sys
from pathlib import Path
def strip_html(text):
    """Return *text* with every ``<...>`` tag sequence removed.

    A tag is a ``<``, one or more characters that are not ``>``, then a
    ``>``.  Text outside such sequences (including surrounding whitespace)
    is returned untouched.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)
def parse_vtt(content):
    """Extract the text of each cue from WebVTT *content*.

    The content is split into blocks on runs of blank lines.  Blocks whose
    first line starts with ``WEBVTT``, ``NOTE`` or ``STYLE`` are skipped, as
    are blocks without a ``-->`` timing line.  For every remaining block the
    lines after the timing line are stripped of tags and joined with single
    spaces.

    Args:
        content: Full text of a ``.vtt`` file.

    Returns:
        A list with one non-empty string per cue, in file order.
    """
    # Drop a leading byte-order mark and normalise CRLF / lone CR to LF.
    # Without this, CRLF input contains no "\n\n" run, the whole file stays a
    # single block starting with "WEBVTT", and every cue is discarded.
    normalized = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

    # Only truly empty lines separate blocks: a whitespace-only line may be
    # part of a cue's text and must not split the cue in two.
    blocks = re.split(r'\n{2,}', normalized.strip())

    cues = []
    for block in blocks:
        lines = block.strip().splitlines()
        if not lines:
            continue
        if lines[0].startswith(('WEBVTT', 'NOTE', 'STYLE')):
            continue

        # An optional cue identifier may precede the timing line, so locate
        # the timing line instead of assuming it comes first.
        ts_idx = next((i for i, line in enumerate(lines) if '-->' in line), None)
        if ts_idx is None:
            continue

        fragments = (strip_html(line).strip() for line in lines[ts_idx + 1:])
        text = ' '.join(fragment for fragment in fragments if fragment)
        if text:
            cues.append(text)
    return cues
def parse_srt(content):
    """Extract the text of each cue from SubRip (SRT) *content*.

    The content is split into blocks on runs of blank lines.  Within each
    block, timing lines and cue counters are discarded; the remaining lines
    are stripped of tags and surrounding whitespace and joined with single
    spaces.

    A digits-only line counts as a cue counter only when it directly
    precedes a timing line, or when its block contains no timing line at
    all.  Digits-only lines that follow a timing line are subtitle text
    (e.g. ``1945``) and are kept.

    Args:
        content: Full text of an ``.srt`` file.

    Returns:
        A list with one non-empty string per cue, in file order.
    """
    timing_re = re.compile(r'\d{2}:\d{2}:\d{2}[,\.]\d{3}\s*-->')
    counter_re = re.compile(r'\d+')

    # A leading byte-order mark would otherwise glue itself to the first cue
    # counter ("\ufeff1"), which then fails the digits-only check and leaks
    # into the output as text.
    content = content.lstrip('\ufeff')

    blocks = re.split(r'\n{2,}', content.strip())
    cues = []
    for block in blocks:
        lines = [raw.strip() for raw in block.strip().splitlines()]
        is_timing = [bool(timing_re.match(line)) for line in lines]
        block_has_timing = any(is_timing)

        text_lines = []
        for idx, line in enumerate(lines):
            if is_timing[idx]:
                continue
            if counter_re.fullmatch(line):
                precedes_timing = idx + 1 < len(lines) and is_timing[idx + 1]
                # Keep numeric subtitle text; drop only genuine counters.
                if precedes_timing or not block_has_timing:
                    continue
            # Strip again after tag removal so "<i> hi </i>" does not leave
            # stray spaces or produce a whitespace-only cue.
            cleaned = strip_html(line).strip()
            if cleaned:
                text_lines.append(cleaned)
        if text_lines:
            cues.append(' '.join(text_lines))
    return cues
def deduplicate(cues):
    """Collapse runs of consecutive equal cues into a single entry.

    Only adjacent repeats are removed; a cue that reappears later, after a
    different cue, is kept.  The first cue is compared against ``None``.
    """
    items = list(cues)
    # Pair every cue with its predecessor (``None`` for the first one).
    predecessors = [None, *items]
    return [cue for before, cue in zip(predecessors, items) if cue != before]
def to_markdown(cues):
    """Turn a list of cue strings into paragraphs of plain text.

    Consecutive duplicate cues are removed, the rest are joined into one
    string, and that string is split into sentences after ``.``, ``!``,
    ``?`` or ``…`` followed by whitespace.  Sentences are grouped eight per
    paragraph; paragraphs are separated by a blank line.  Returns an empty
    string when no cues remain.
    """
    unique_cues = deduplicate(cues)
    if not unique_cues:
        return ''

    joined = ' '.join(unique_cues)
    sentences = re.split(r'(?<=[.!?…])\s+', joined)

    per_paragraph = 8
    paragraphs = [
        ' '.join(sentences[start:start + per_paragraph])
        for start in range(0, len(sentences), per_paragraph)
    ]
    return '\n\n'.join(paragraphs)
def convert(input_path):
    """Read a subtitle file and return its text as Markdown paragraphs.

    The parser is chosen from the file suffix (case-insensitive): ``.vtt``
    uses ``parse_vtt`` and ``.srt`` uses ``parse_srt``.

    Args:
        input_path: ``pathlib.Path`` of the subtitle file.

    Returns:
        The Markdown text produced by ``to_markdown``.

    Raises:
        ValueError: If the suffix is neither ``.vtt`` nor ``.srt``.
    """
    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading byte-order
    # mark.  With plain 'utf-8' the BOM stays in the text, so the first SRT
    # cue counter ("\ufeff1") is not recognised and ends up in the output.
    content = input_path.read_text(encoding='utf-8-sig', errors='replace')
    suffix = input_path.suffix.lower()
    if suffix == '.vtt':
        cues = parse_vtt(content)
    elif suffix == '.srt':
        cues = parse_srt(content)
    else:
        raise ValueError(f'Unsupported format: {suffix}')
    return to_markdown(cues)
def main():
    """Command-line entry point: convert one subtitle file to Markdown.

    Reads ``sys.argv``: the first argument is the input ``.vtt``/``.srt``
    file, the optional second argument is the output path (defaults to the
    input path with a ``.md`` suffix).

    Returns:
        The process exit status: 0 on success, 1 on a usage or input error.
    """
    if len(sys.argv) < 2:
        print(f'Usage: {sys.argv[0]} <file.vtt|file.srt> [output.md]', file=sys.stderr)
        return 1

    input_path = Path(sys.argv[1])
    if not input_path.exists():
        print(f'File not found: {input_path}', file=sys.stderr)
        return 1
    # A directory passes exists() but cannot be read as text; reject it here
    # rather than failing with a traceback inside convert().
    if not input_path.is_file():
        print(f'Not a regular file: {input_path}', file=sys.stderr)
        return 1

    try:
        md = convert(input_path)
    except ValueError as exc:
        # convert() raises ValueError for an unsupported file suffix.
        print(f'Error: {exc}', file=sys.stderr)
        return 1

    output_path = Path(sys.argv[2]) if len(sys.argv) >= 3 else input_path.with_suffix('.md')
    output_path.write_text(md, encoding='utf-8')
    print(f'Saved: {output_path}')
    return 0


if __name__ == '__main__':
    sys.exit(main())