From 368c26d491ae54bf2e30c2a84cb1a3de795799cb Mon Sep 17 00:00:00 2001 From: Morgan Date: Thu, 15 Feb 2024 01:32:58 +0900 Subject: [PATCH] Fix --- snuenc.sh | 1 + snusub.py | 238 +++++++++++++++++++++++++++++------------------------ subedit.py | 79 ++++++++++++++++++ 3 files changed, 211 insertions(+), 107 deletions(-) create mode 100644 snuenc.sh create mode 100644 subedit.py diff --git a/snuenc.sh b/snuenc.sh new file mode 100644 index 0000000..be52aab --- /dev/null +++ b/snuenc.sh @@ -0,0 +1 @@ +i="input.mp4";j="test.stacked.vtt";ffmpeg -f lavfi -i color=c=gray:s=508x1080:r=ntsc:d=$(ffprobe -i $i -show_entries format=duration -v quiet -of csv="p=0") -vf "subtitles=$j:force_style='FontName=Helvetica,Alignment=4,Fontsize=9.5,Outline=0,Shadow=0,MarginH=2,MarginV=4,Spacing=0'" -b:v 2000k -f nut - | ffmpeg -i $i -i - -filter_complex "[0:v][1:v]hstack=inputs=2:shortest=1[v];[0:a]anull[a2];[0:a][a2]amerge[a]" -map "[v]" -map "[a]" -b:v 2000k -f nut - | ffplay - diff --git a/snusub.py b/snusub.py index 02ecad8..b9ab1e5 100644 --- a/snusub.py +++ b/snusub.py @@ -4,6 +4,8 @@ import re, json import os, sys from datetime import timedelta +### + def from_vtt(vtt_string): VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})" VTT_LINE_NUMBER_PATTERN = r"^\d+$" @@ -24,6 +26,9 @@ def from_vtt(vtt_string): start, end = match.groups() content = '\n'.join(lines[1:]) + "\n" + # if start == end: + # continue + subtitles.append({ 'start': start, 'end': end, @@ -39,6 +44,8 @@ def to_vtt(subtitles): if not subtitle.get("split", False): start = subtitle['start'] end = subtitle['end'] + if not start or not end or start == end: + raise Exception(f"VTT timestamp parse error from #{idx}.") vtt_content += f"{start} --> {end}\n{content}\n\n\n" else: vtt_content += f"NOTE {content}\n\n\n" @@ -58,6 +65,7 @@ def to_stacked_vtt(subtitles, continous = True): buffer += "\n" else: buffer += " " + buffer += subtitle['content'].strip() if n < len(subtitles) - 1: @@ -65,6 +73,10 @@ def to_stacked_vtt(subtitles, continous = True): else: end_time = subtitle['end'] + if not subtitle['start'] or not end_time: + raise Exception(f"VTT timestamp parse error from #{idx}.") + if subtitle['start'] == end_time: + raise Exception(f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}") vtt_content += f"{subtitle['start']} --> {end_time}\n" vtt_content += buffer vtt_content += "\n\n\n" @@ -73,16 +85,18 @@ def to_stacked_vtt(subtitles, continous = True): return vtt_content +### + def script_from_word_vtt(wordvtt): subtitles = from_vtt(wordvtt) - print(f"VTT {len(subtitles)} lines. Generating script file from VTT.") + print(f"Generating script file from VTT...") sentences = [] - EXCEPTION_FLAG, ADD_NEXT_SENTENCE = "", 0 + ADD_NEXT_SENTENCE = 0 for n, subtitle in enumerate(subtitles): sentence = subtitle["content"].replace("", "").replace("", "") if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE: sentences.append(sentence) - ADD_NEXT_SENTENCE = 0 + ADD_NEXT_SENTENCE = 0 if subtitle["content"][-4:] == "": ADD_NEXT_SENTENCE = 1 if n + 2 < len(subtitles): @@ -90,10 +104,11 @@ def script_from_word_vtt(wordvtt): ADD_NEXT_SENTENCE = 0 return sentences -def create_word_scenes(wordvtt, scriptraw): - subtitles = from_vtt(wordvtt) - scripts = [i for i in scriptraw.split("\n") if i] - print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines") +def create_word_scenes(raw_vtt, raw_script): + subtitles = from_vtt(raw_vtt) + scripts = [i for i in raw_script.split("\n") if i] + print(f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words...") + scenes = [] for n, script in enumerate(scripts): if len(script.split(" ")) == 1: @@ -110,8 +125,7 @@ def create_word_scenes(wordvtt, scriptraw): if sentence == scenes[scenes_cur+1].get("scene"): scenes_cur += 1 else: - print(f"Error, Mismatch in scenes\n=>\"[{scenes_cur}] {scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"") - return + raise Exception(f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"") current_scene = scenes[scenes_cur] if current_scene["timestamp"]: @@ -120,9 +134,12 @@ def create_word_scenes(wordvtt, scriptraw): word_idx = 0 if ("" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")): + # Ignore trailing dummy subtitle after last word indexed. pass + if ("" in subtitle["content"]) and word_idx >= len(sentence.split(" ")): - print(f"Error, index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}") + # If there is trailing non-dummy timestamped subtitle, Reset word_idx and step to next scene. (Repeating sentence doesnt increment cur.) + print(f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}") word_idx = 0 scenes_cur += 1 current_scene = scenes[scenes_cur] @@ -132,26 +149,25 @@ def create_word_scenes(wordvtt, scriptraw): word_idx = 0 print(f"Changed to {word_idx}, {scenes_cur}") + # Start matching words. if "" in subtitle["content"]: word = subtitle["content"].split("")[1].split("")[0] if word not in sentence.split(" "): - print(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"") + raise Exception(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"") return try: assert sentence.split(" ")[word_idx] == word except: - print(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"") - return + raise Exception(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"") word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word} current_scene["timestamp"].append(word_time) for scene in scenes: if len(scene["scene"].split(" ")) != len(scene["timestamp"]): - print("Error, Mismatch length") - return + raise Exception("Error, Scene length and timestamp length doesnt match.") if "" in scene["scene"].split(" "): print(repr(scene["scene"])) @@ -163,13 +179,90 @@ def create_word_scenes(wordvtt, scriptraw): for i, j in zip(full_script, full_scenes): if i.replace("##", "") != j["word"]: - print("Error, Mismatch") + raise Exception("Error, Mismatch") return assert len(full_scenes) == len(full_script) return full_script, full_scenes +def scene_from_new_script(raw_script, full_script, full_scenes): + mod_script = raw_script.replace("\n", " \n ").split(" ") + mod_script = [i for i in mod_script if i] + n = 0 + while True: + if mod_script[n] == "\n": + mod_script[n-1] += "\n" + del(mod_script[n]) + n -= 1 + n += 1 + if n == len(mod_script): + break + + print(f"Original: {len(full_script)}, Modded: {len(mod_script)}") + allowed_list = [".", "\n", "\n\n", ",", "?", "##"] + + def normalized(x): + for i in allowed_list: + x = x.replace(i, "") + return x.upper() + + same = lambda a, b: normalized(a) == normalized(b) + new_script, new_timestamp, orig_index, n = [], [], 0, 0 + fail = 0 + while n < len(mod_script): + print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}") + word = mod_script[n] + if same(word, full_script[orig_index].replace("##", "")): + cur = full_scenes[orig_index] + new_script.append(word.replace("##", "")) + new_timestamp.append({"start": cur["start"], "end": cur["end"]}) + fail = 0 + else: + if fail > 10: + raise Exception("Error: Failed to match words,") + return + fail += 1 + n -= 1 + n, orig_index = n+1, orig_index+1 + assert len(new_script) == len(new_timestamp) + return new_script, new_timestamp + +def build_new_subtitle(new_script, new_timestamp): + buffer, new_scenes, start, end = [], [], None, None + current_scene = [] + + for i, j in zip(new_script, new_timestamp): + buffer.append(i.replace("\n", "")) + if not start: + start = j["start"] + + if "\n" in i: + current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]}) + buffer, start = [], None + + if "\n\n" in i: + print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"") + new_scenes.append(current_scene) + current_scene = [] + + if start: + buffer.append(i.replace("\n", "")) + current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]}) + + if current_scene != (new_scenes[-1] if new_scenes else None): + new_scenes.append(current_scene) + + newsub = [] + for n, i in enumerate(new_scenes): + newsub += i + if n < len(new_scenes) - 1: + newsub.append({"content": "Break", "start": None, "end": None, "split": True}) + + return newsub + +### + def autobreak(lines, times): from datetime import timedelta @@ -222,108 +315,39 @@ def autobreak(lines, times): return script -def scene_from_new_script(raw_script, full_script, full_scenes): - mod_script = raw_script.replace("\n", " \n ").split(" ") - mod_script = [i for i in mod_script if i] - n = 0 - while True: - if mod_script[n] == "\n": - mod_script[n-1] += "\n" - del(mod_script[n]) - n -= 1 - n += 1 - if n == len(mod_script): - break - - print(f"Original: {len(full_script)}, Modded: {len(mod_script)}") - allowed_list = [".", "\n", "\n\n", ",", "?", "##"] - - def normalized(x): - for i in allowed_list: - x = x.replace(i, "") - return x.upper() - - same = lambda a, b: normalized(a) == normalized(b) - new_script, new_timestamp, orig_index, n = [], [], 0, 0 - fail = 0 - while n < len(mod_script): - print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}") - word = mod_script[n] - if same(word, full_script[orig_index].replace("##", "")): - cur = full_scenes[orig_index] - new_script.append(word.replace("##", "")) - new_timestamp.append({"start": cur["start"], "end": cur["end"]}) - fail = 0 - else: - if fail > 10: - print("Error: Failed to match words,") - return - fail += 1 - n -= 1 - n, orig_index = n+1, orig_index+1 - assert len(new_script) == len(new_timestamp) - return new_script, new_timestamp - -def build_new_subtitle(new_script, new_timestamp): - buffer, new_scenes, start, end = [], [], None, None - current_scene = [] - - for i, j in zip(new_script, new_timestamp): - if "\n" in i: - buffer.append(i.replace("\n", "")) - current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]}) - buffer, start = [], None - - if "\n\n" in i: - print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"") - new_scenes.append(current_scene) - current_scene = [] - - else: - buffer.append(i) - if not start: - start = j["start"] - - if start: - buffer.append(i.replace("\n", "")) - current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]}) - - if current_scene != (new_scenes[-1] if new_scenes else None): - new_scenes.append(current_scene) - - newsub = [] - for n, i in enumerate(new_scenes): - newsub += i - if n < len(new_scenes) - 1: - newsub.append({"content": "Break", "start": None, "end": None, "split": True}) - - return newsub - -### +############################################ def saveFile(filename, data, override = False): if os.path.exists(filename) and not override: - print(f"File {filename} already exists.") - return -1 + raise Exception(f"File {filename} already exists.") + return with open(filename, "w") as f: f.write(data) def openFile(filename): + if not os.path.exists(filename): + raise Exception(f"File {filename} doesnt exists.") + return with open(filename, "r") as f: data = f.read() if not data: - return -1 + raise Exception("Data empty.") + return return data -### +############################################ if __name__=="__main__": + PROG = sys.argv[0].split("/")[-1] if len(sys.argv) not in (3, 4): - PROG = sys.argv[0].split("/")[-1] - print(f"Usage: {PROG} script [VTT file] \n" \ - f" {" "*len(PROG)} apply [VTT file] [script file] \n" \ - f" {" "*len(PROG)} create [JSON file]" \ - ) + print( \ +f"""Usage: {PROG} [COMMAND] [FILES]... + +Commands: + - script Generates script file from vtt file. + - apply