#!/usr/bin/env python3
"""Convert VTT or SRT subtitle files to Markdown plain text."""

import re
import sys
from pathlib import Path
from typing import Iterable, List

# Patterns are compiled once at import time because the parsers apply them
# to every line of the subtitle file.
_TAG_RE = re.compile(r'<[^>]+>')
_BLOCK_SEP_RE = re.compile(r'\n{2,}')
_SRT_COUNTER_RE = re.compile(r'\d+')
_SRT_TIMESTAMP_RE = re.compile(r'\d{2}:\d{2}:\d{2}[,\.]\d{3}\s*-->')
_SENTENCE_END_RE = re.compile(r'(?<=[.!?…])\s+')


def strip_html(text: str) -> str:
    """Return *text* with every ``<...>`` tag removed."""
    return _TAG_RE.sub('', text)


def _split_blocks(content: str) -> List[str]:
    """Split subtitle text into blocks separated by one or more blank lines.

    A leading byte-order mark is dropped and CRLF / CR line endings are
    normalised to LF first, so text that was not read through Python's
    universal-newline handling splits the same way as text that was.
    """
    text = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
    text = text.strip()
    if not text:
        return []
    return _BLOCK_SEP_RE.split(text)


def parse_vtt(content: str) -> List[str]:
    """Extract cue texts from WebVTT *content*.

    Blocks whose first line starts with ``WEBVTT``, ``NOTE`` or ``STYLE``
    are skipped, as are blocks without a ``-->`` timing line. For every
    remaining block the lines after the timing line are stripped of tags
    and joined with single spaces.

    Returns:
        One string per non-empty cue, in file order.
    """
    cues = []
    for block in _split_blocks(content):
        lines = block.strip().splitlines()
        if not lines:
            continue
        if lines[0].startswith(('WEBVTT', 'NOTE', 'STYLE')):
            continue
        # Everything up to and including the timing line is metadata
        # (optional cue identifier + timestamps), not caption text.
        ts_idx = next(
            (i for i, line in enumerate(lines) if '-->' in line), None
        )
        if ts_idx is None:
            continue
        parts = (strip_html(line).strip() for line in lines[ts_idx + 1:])
        text = ' '.join(part for part in parts if part)
        if text:
            cues.append(text)
    return cues


def parse_srt(content: str) -> List[str]:
    """Extract cue texts from SubRip (SRT) *content*.

    Timestamp lines are dropped. A digits-only line is dropped only when
    the line right after it is a timestamp line, i.e. when it is the cue
    counter; digits-only caption text such as ``42`` is kept.

    Returns:
        One string per non-empty block, in file order, with the block's
        text lines stripped of tags and joined with single spaces.
    """
    cues = []
    for block in _split_blocks(content):
        lines = [line.strip() for line in block.splitlines()]
        text_lines = []
        for idx, line in enumerate(lines):
            if _SRT_TIMESTAMP_RE.match(line):
                continue
            next_line = lines[idx + 1] if idx + 1 < len(lines) else ''
            is_counter = (
                _SRT_COUNTER_RE.fullmatch(line) is not None
                and _SRT_TIMESTAMP_RE.match(next_line) is not None
            )
            if is_counter:
                continue
            # Strip again after tag removal so that e.g. "<i> </i>" does
            # not produce a whitespace-only cue (matches parse_vtt).
            cleaned = strip_html(line).strip()
            if cleaned:
                text_lines.append(cleaned)
        if text_lines:
            cues.append(' '.join(text_lines))
    return cues


def deduplicate(cues: Iterable[str]) -> List[str]:
    """Collapse runs of identical consecutive cues into a single cue.

    Only adjacent repeats are removed; a cue that reappears later, after a
    different cue, is kept.
    """
    result = []
    for cue in cues:
        if not result or result[-1] != cue:
            result.append(cue)
    return result


def to_markdown(cues: Iterable[str], sentences_per_paragraph: int = 8) -> str:
    """Join *cues* into Markdown paragraphs.

    Consecutive duplicate cues are removed, the rest are joined with
    spaces, the text is split after ``.``, ``!``, ``?`` or ``…`` followed
    by whitespace, and the resulting sentences are grouped into paragraphs
    separated by a blank line.

    Args:
        cues: Cue texts in display order.
        sentences_per_paragraph: Maximum number of sentences per paragraph.

    Returns:
        The Markdown text, or ``''`` when there are no cues.

    Raises:
        ValueError: If *sentences_per_paragraph* is less than 1.
    """
    if sentences_per_paragraph < 1:
        raise ValueError('sentences_per_paragraph must be at least 1')
    unique_cues = deduplicate(cues)
    if not unique_cues:
        return ''
    sentences = _SENTENCE_END_RE.split(' '.join(unique_cues))
    paragraphs = [
        ' '.join(sentences[start:start + sentences_per_paragraph])
        for start in range(0, len(sentences), sentences_per_paragraph)
    ]
    return '\n\n'.join(paragraphs)


def convert(input_path: Path) -> str:
    """Read a ``.vtt`` or ``.srt`` file and return its text as Markdown.

    Args:
        input_path: Path to the subtitle file; the format is chosen from
            its (case-insensitive) suffix.

    Returns:
        The Markdown text produced by :func:`to_markdown`.

    Raises:
        ValueError: If the suffix is neither ``.vtt`` nor ``.srt``.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well, but also swallows a leading
    # BOM, which would otherwise end up glued to the first line.
    content = input_path.read_text(encoding='utf-8-sig', errors='replace')
    suffix = input_path.suffix.lower()
    if suffix == '.vtt':
        cues = parse_vtt(content)
    elif suffix == '.srt':
        cues = parse_srt(content)
    else:
        raise ValueError(f'Unsupported format: {suffix}')
    return to_markdown(cues)


def main() -> int:
    """Run the command-line interface and return the process exit code.

    Reads the input path from ``sys.argv[1]`` and the optional output path
    from ``sys.argv[2]``; when no output path is given, the input path with
    its suffix replaced by ``.md`` is used.
    """
    if len(sys.argv) < 2:
        print(
            f'Usage: {sys.argv[0]} <input.vtt|input.srt> [output.md]',
            file=sys.stderr,
        )
        return 1
    input_path = Path(sys.argv[1])
    if not input_path.exists():
        print(f'File not found: {input_path}', file=sys.stderr)
        return 1
    try:
        md = convert(input_path)
    except ValueError as err:
        # Unsupported extension: report it instead of dumping a traceback.
        print(err, file=sys.stderr)
        return 1
    if len(sys.argv) >= 3:
        output_path = Path(sys.argv[2])
    else:
        output_path = input_path.with_suffix('.md')
    output_path.write_text(md, encoding='utf-8')
    print(f'Saved: {output_path}')
    return 0


if __name__ == '__main__':
    sys.exit(main())