#!/usr/bin/env python3
"""Helpers for re-segmenting word-level WebVTT subtitles against a script.

This module is the readable, repaired form of a flattened git patch that
renamed ``restack.py`` to ``stackvtt.py`` (introducing ``stack_subtitle``)
and added ``wordvtt.py``.  The code below contains the ``wordvtt.py``
functions plus ``stack_subtitle``.

Cues are plain dicts with the keys ``start``, ``end`` and ``content``.  A
dict carrying ``split: True`` is a section-break marker rather than a cue.
"""

import json
import os
import re
from datetime import timedelta

VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
VTT_LINE_NUMBER_PATTERN = r"^\d+$"

_TIMECODE_RE = re.compile(VTT_TIMECODE_PATTERN)
_LINE_NUMBER_RE = re.compile(VTT_LINE_NUMBER_PATTERN)

# NOTE(review): the tag literals were stripped from the patch text (the code
# read ``.replace("", "")`` and ``.split("")``, which raises ValueError).
# They are assumed to be the underline tags used to highlight the current
# word in word-level VTT exports -- confirm against a real input file.
WORD_OPEN_TAG = "<u>"
WORD_CLOSE_TAG = "</u>"

# Characters ignored when comparing an edited word with the original word.
_IGNORED_CHARS = str.maketrans("", "", ".,\n")


class ScriptMismatchError(ValueError):
    """Raised when the subtitles and the script cannot be aligned."""


def from_vtt(vtt_string):
    """Parse WebVTT text into a list of cue dicts.

    Blocks are separated by one or more blank lines.  A leading ``WEBVTT``
    block is dropped, an optional numeric cue identifier line is skipped, and
    blocks without a timecode line (for example ``NOTE`` blocks) are ignored.

    Returns:
        A list of ``{"start": str, "end": str, "content": str}`` dicts in
        file order.  ``content`` keeps inner newlines of multi-line cues.
    """
    text = vtt_string.replace("\r\n", "\n").strip()
    parts = re.split(r"\n\n+", text)
    if parts and parts[0].startswith("WEBVTT"):
        parts.pop(0)

    subtitles = []
    for part in parts:
        lines = part.split("\n")
        match = _TIMECODE_RE.match(lines[0])
        # A cue may be preceded by a bare numeric identifier line.
        if not match and len(lines) > 1 and _LINE_NUMBER_RE.match(lines[0]):
            lines.pop(0)
            match = _TIMECODE_RE.match(lines[0])
        if not match:
            continue
        start, end = match.groups()
        subtitles.append({
            "start": start,
            "end": end,
            "content": "\n".join(lines[1:]),
        })
    return subtitles


def to_time(value):
    """Return *value* as a WebVTT timestamp string.

    The patch called ``to_time`` without defining it.  Timestamps produced by
    ``from_vtt`` are already strings and are returned unchanged; a
    ``timedelta`` is rendered as ``HH:MM:SS.mmm``.
    """
    if isinstance(value, timedelta):
        total_ms = value // timedelta(milliseconds=1)
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        seconds, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"
    return str(value)


def to_vtt(subtitles):
    """Serialize cue dicts to WebVTT text.

    Regular cues become ``start --> end`` blocks; entries flagged with
    ``split`` become ``NOTE <content>`` blocks.  Blocks are separated by two
    blank lines and the result has no trailing whitespace.
    """
    blocks = ["WEBVTT"]
    for subtitle in subtitles:
        content = subtitle["content"]
        if subtitle.get("split", False):
            blocks.append(f"NOTE {content}")
        else:
            blocks.append(f"{subtitle['start']} --> {subtitle['end']}\n{content}")
    return "\n\n\n".join(blocks).strip()


def to_stacked_vtt(subtitles):
    """Serialize cues so each cue shows all text since the last section break.

    Within a section the text accumulates: a new line is started after text
    that ends with a period, otherwise the next cue is joined with a space.
    A ``split`` entry resets the accumulated text and emits nothing.

    Returns:
        The WebVTT document as a string.
    """
    blocks = ["WEBVTT"]
    stacked = ""
    for subtitle in subtitles:
        if subtitle.get("split", False):
            stacked = ""
            continue
        text = subtitle["content"].strip()
        if stacked and text:
            # Decide from what is already stacked, not from the incoming cue,
            # so a line break follows a finished sentence.
            stacked += "\n" if stacked.endswith(".") else " "
        stacked += text
        timing = f"{to_time(subtitle['start'])} --> {to_time(subtitle['end'])}"
        blocks.append(f"{timing}\n{stacked}")
    return "\n\n\n".join(blocks)


def stack_subtitle(parsed_vtt, min_periods=4, min_lines=5):
    """Group cues into sections and make each cue show its section so far.

    A section stays open while the cues collected so far contain fewer than
    *min_periods* ``"."`` characters or number fewer than *min_lines*; the
    cue that arrives once both thresholds are met closes the section.  Cues
    left over at the end form a final section instead of being dropped.

    Typical use: ``print(to_vtt(stack_subtitle(from_vtt(text))))``.

    Args:
        parsed_vtt: Cue dicts as returned by ``from_vtt``.
        min_periods: Period count required before a section may close.
        min_lines: Cue count required before a section may close.

    Returns:
        New cue dicts (the input is not modified) whose ``content`` is the
        newline-terminated text of every cue in the section up to and
        including that cue.
    """
    sections, pending = [], []
    for cue in parsed_vtt:
        periods = sum(item["content"].count(".") for item in pending)
        keep_open = periods < min_periods or len(pending) < min_lines
        pending.append(cue)
        if not keep_open:
            sections.append(pending)
            pending = []
    if pending:
        sections.append(pending)

    stacked = []
    for section in sections:
        text = ""
        for cue in section:
            text += cue["content"] + "\n"
            stacked.append({**cue, "content": text})
    return stacked


def create_word_scenes(wordvtt, scriptraw):
    """Align word-highlighted VTT cues with the lines of a script.

    Every multi-word script line is a "scene".  Each cue's text, with the
    highlight tags removed, must equal the current scene or the next one, and
    the highlighted word must be the next unseen word of that scene.

    Args:
        wordvtt: WebVTT text in which each cue highlights one word.
        scriptraw: Script text, one sentence per line.

    Returns:
        ``(full_script, full_scenes)``: the flat list of words and a parallel
        list of ``{"start", "end", "index", "word"}`` dicts.

    Raises:
        ScriptMismatchError: If the cues and the script do not line up.
    """
    subtitles = from_vtt(wordvtt)
    scripts = [line for line in scriptraw.split("\n") if line]
    print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")

    # Single-word lines are skipped on both sides.
    scenes = [
        {"scene": script, "timestamp": []}
        for script in scripts
        if len(script.split(" ")) != 1
    ]

    scenes_cur = 0
    for subtitle in subtitles:
        raw = subtitle["content"]
        sentence = raw.replace(WORD_OPEN_TAG, "").replace(WORD_CLOSE_TAG, "")
        words = sentence.split(" ")
        if len(words) == 1:
            continue

        if scenes_cur >= len(scenes) or sentence != scenes[scenes_cur]["scene"]:
            has_next = scenes_cur + 1 < len(scenes)
            if has_next and sentence == scenes[scenes_cur + 1]["scene"]:
                scenes_cur += 1
            else:
                raise ScriptMismatchError(
                    f"Error, Mismatch\n=> scenes[{scenes_cur}] != \"{sentence}\""
                )

        current_scene = scenes[scenes_cur]
        stamps = current_scene["timestamp"]
        word_idx = stamps[-1]["index"] + 1 if stamps else 0

        # Cues without a highlighted word carry no timing information.
        if WORD_OPEN_TAG not in raw:
            continue

        word = raw.partition(WORD_OPEN_TAG)[2].partition(WORD_CLOSE_TAG)[0]
        if word not in sentence:
            raise ScriptMismatchError(
                f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\""
            )
        if word_idx >= len(words) or words[word_idx] != word:
            raise ScriptMismatchError(
                f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\""
            )
        stamps.append({
            "start": subtitle["start"],
            "end": subtitle["end"],
            "index": word_idx,
            "word": word,
        })

    full_script, full_scenes = [], []
    for scene in scenes:
        scene_words = scene["scene"].split(" ")
        if len(scene_words) != len(scene["timestamp"]):
            raise ScriptMismatchError("Error, Mismatch length")
        full_script += scene_words
        full_scenes += scene["timestamp"]

    for expected, stamp in zip(full_script, full_scenes):
        if expected != stamp["word"]:
            raise ScriptMismatchError("Error, Mismatch")

    return full_script, full_scenes


def scene_from_new_script(raw_script, full_script, full_scenes):
    """Map an edited script back onto the original word timestamps.

    The edited script may drop words and change case, periods, commas and
    line breaks.  Its words are matched in order against *full_script*;
    original words with no counterpart are skipped.  Line breaks are kept as
    a ``"\\n"`` (or ``"\\n\\n"`` for a blank line) suffix on the preceding
    word.

    Returns:
        ``(new_script, new_timestamp)``: the edited words and a parallel list
        of ``{"start", "end"}`` dicts.

    Raises:
        ScriptMismatchError: If an edited word cannot be found in the
            remaining original words.
    """
    mod_script = []
    for token in raw_script.replace("\n", " \n ").split(" "):
        if not token:
            continue
        if token == "\n":
            # Attach the break to the previous word; a break before the first
            # word has nothing to attach to and is dropped.
            if mod_script:
                mod_script[-1] += "\n"
            continue
        mod_script.append(token)
    print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")

    def normalized(text):
        return text.translate(_IGNORED_CHARS).upper()

    new_script, new_timestamp = [], []
    orig_index = 0
    total = len(full_script)
    for word in mod_script:
        wanted = normalized(word)
        while orig_index < total and normalized(full_script[orig_index]) != wanted:
            orig_index += 1
        if orig_index >= total:
            raise ScriptMismatchError(
                f"Error, Mismatch\n=> {word!r} not found in the original script"
            )
        cur = full_scenes[orig_index]
        new_script.append(word)
        new_timestamp.append({"start": cur["start"], "end": cur["end"]})
        orig_index += 1

    return new_script, new_timestamp


def build_new_subtitle(new_script, new_timestamp):
    """Build cues from edited words, one cue per script line.

    A word ending in a newline closes the current cue; a word carrying a
    blank line (``"\\n\\n"``) also closes the current section.  Sections are
    joined with a ``{"content": "Break", "split": True}`` marker.

    Returns:
        A flat list of cue dicts and break markers.
    """
    sections, current_section = [], []
    words, start, end = [], None, None

    for token, stamp in zip(new_script, new_timestamp):
        words.append(token.replace("\n", ""))
        if start is None:
            start = stamp["start"]
        end = stamp["end"]
        if "\n" not in token:
            continue

        current_section.append({"content": " ".join(words), "start": start, "end": end})
        words, start = [], None
        if "\n\n" in token:
            last_line = current_section[-1]["content"]
            print(f"Section break at line #{len(current_section):<3}| \"{last_line}\"")
            sections.append(current_section)
            current_section = []

    # Flush words after the last line break exactly once.
    if words:
        current_section.append({"content": " ".join(words), "start": start, "end": end})
    if current_section:
        sections.append(current_section)

    newsub = []
    for n, section in enumerate(sections):
        if n:
            newsub.append({"content": "Break", "start": None, "end": None, "split": True})
        newsub += section
    return newsub


def saveFile(filename, data, override=False):
    """Write *data* to *filename* as UTF-8.

    Returns ``-1`` without writing when the file exists and *override* is
    false; otherwise returns ``None``.
    """
    if os.path.exists(filename) and not override:
        print(f"File {filename} already exists.")
        return -1
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data)


def openFile(filename):
    """Return the UTF-8 text of *filename*, or ``-1`` if the file is empty."""
    with open(filename, "r", encoding="utf-8") as f:
        data = f.read()
    if not data:
        return -1
    return data


def main():
    """Run the pipeline on ``test.vtt`` / ``test.txt``.

    ``test.script`` is created on the first run and, because it is never
    overwritten, can be edited by hand before the next run.
    """
    vttfile = "test.vtt"
    scriptfile = "test.txt"
    modfile = "test.script"

    vtt_text, script_text = openFile(vttfile), openFile(scriptfile)
    if vtt_text == -1 or script_text == -1:
        print("Error, empty input file")
        return

    try:
        full_script, full_scenes = create_word_scenes(vtt_text, script_text)
        saveFile(modfile, " ".join(full_script).replace(". ", ".\n"))
        mod_text = openFile(modfile)
        if mod_text == -1:
            print("Error, empty input file")
            return
        new_script, new_timestamp = scene_from_new_script(mod_text, full_script, full_scenes)
    except ScriptMismatchError as err:
        print(err)
        return

    final_vtt = build_new_subtitle(new_script, new_timestamp)
    saveFile("test.final.vtt", to_vtt(final_vtt), True)
    saveFile("test.final.json", json.dumps(final_vtt, indent=2), True)


if __name__ == "__main__":
    main()