diff --git a/backup.py b/backup.py index e8b3bca..a0aaa0d 100644 --- a/backup.py +++ b/backup.py @@ -1,11 +1,12 @@ import json import re + def parse_vtt(vtt_filename): - with open(vtt_filename, 'r', encoding='utf-8') as file: + with open(vtt_filename, "r", encoding="utf-8") as file: lines = file.readlines() - time_pattern = re.compile(r'(\d+\.\d{3}) --> (\d+\.\d{3})') + time_pattern = re.compile(r"(\d+\.\d{3}) --> (\d+\.\d{3})") subtitles = [] current_subtitle = {} @@ -13,43 +14,47 @@ def parse_vtt(vtt_filename): for line in lines[1:]: match = time_pattern.match(line) if match: - current_subtitle['start'] = float(match.group(1)) - current_subtitle['end'] = float(match.group(2)) - current_subtitle['content'] = "" - elif line.strip() == '': + current_subtitle["start"] = float(match.group(1)) + current_subtitle["end"] = float(match.group(2)) + current_subtitle["content"] = "" + elif line.strip() == "": if current_subtitle: - if current_subtitle['content'][-1] == "\n": - current_subtitle['content'] = current_subtitle['content'][:-1] + if current_subtitle["content"][-1] == "\n": + current_subtitle["content"] = current_subtitle["content"][:-1] subtitles.append(current_subtitle) current_subtitle = {} else: - current_subtitle['content'] += line.strip() + "\n" # Space to separate lines + current_subtitle["content"] += ( + line.strip() + "\n" + ) # Space to separate lines if current_subtitle: - if current_subtitle['content'][-1] == "\n": - current_subtitle['content'] = current_subtitle['content'][:-1] + if current_subtitle["content"][-1] == "\n": + current_subtitle["content"] = current_subtitle["content"][:-1] subtitles.append(current_subtitle) return subtitles -def subtitles_to_backup(subtitles): +def subtitles_to_backup(subtitles): backup_data = { - "subtitles": subtitles, + "subtitles": subtitles, "script_lines": [], "line_index": len(subtitles), - "current_subtitle": {}, - "play": 0 + "current_subtitle": {}, + "play": 0, } return backup_data + def main(vtt_filename, output_filename): subtitles = parse_vtt(vtt_filename) backup_data = subtitles_to_backup(subtitles) - with open(output_filename, 'w', encoding='utf-8') as json_file: + with open(output_filename, "w", encoding="utf-8") as json_file: json.dump(backup_data, json_file, indent=2) -vtt_filename = 'audio.vtt' -output_filename = 'backup2.json' + +vtt_filename = "audio.vtt" +output_filename = "backup2.json" main(vtt_filename, output_filename) diff --git a/snusub.py b/snusub.py index b9ab1e5..bf85045 100644 --- a/snusub.py +++ b/snusub.py @@ -6,410 +6,479 @@ from datetime import timedelta ### + def from_vtt(vtt_string): - VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})" - VTT_LINE_NUMBER_PATTERN = r"^\d+$" - parts = re.split(r'\n\n+', vtt_string.strip()) - if parts[0].startswith('WEBVTT'): - parts.pop(0) + VTT_TIMECODE_PATTERN = ( + r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})" + ) + VTT_LINE_NUMBER_PATTERN = r"^\d+$" + parts = re.split(r"\n\n+", vtt_string.strip()) + if parts[0].startswith("WEBVTT"): + parts.pop(0) - subtitles = [] - for part in parts: - lines = part.split('\n') - match = re.match(VTT_TIMECODE_PATTERN, lines[0]) - if not match: - if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]): - lines.pop(0) - match = re.match(VTT_TIMECODE_PATTERN, lines[0]) - if not match: - continue + subtitles = [] + for part in parts: + lines = part.split("\n") + match = re.match(VTT_TIMECODE_PATTERN, lines[0]) + if not match: + if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]): + lines.pop(0) + match = re.match(VTT_TIMECODE_PATTERN, lines[0]) + if not match: + continue - start, end = match.groups() - content = '\n'.join(lines[1:]) + "\n" - # if start == end: - # continue - - subtitles.append({ - 'start': start, - 'end': end, - 'content': (content.replace("-\n", "\n").replace("-\n", "\n").replace("-", " ").replace("%", " ").replace(" "," ").replace(" "," ").replace(" ","").replace("","").replace(" \n", "\n"))[:-1] - }) + start, end = match.groups() + content = "\n".join(lines[1:]) + "\n" + # if start == end: + # continue + + subtitles.append( + { + "start": start, + "end": end, + "content": ( + content.replace("-\n", "\n") + .replace("-\n", "\n") + .replace("-", " ") + .replace("%", " ") + .replace(" ", " ") + .replace(" ", " ") + .replace(" ", "") + .replace("", "") + .replace(" \n", "\n") + )[:-1], + } + ) + + return subtitles - return subtitles def to_vtt(subtitles): vtt_content = "WEBVTT\n\n\n" for idx, subtitle in enumerate(subtitles): - content = subtitle['content'] + content = subtitle["content"] if not subtitle.get("split", False): - start = subtitle['start'] - end = subtitle['end'] - if not start or not end or start == end: - raise Exception(f"VTT timestamp parse error from #{idx}.") - vtt_content += f"{start} --> {end}\n{content}\n\n\n" + start = subtitle["start"] + end = subtitle["end"] + if not start or not end or start == end: + raise Exception(f"VTT timestamp parse error from #{idx}.") + vtt_content += f"{start} --> {end}\n{content}\n\n\n" else: - vtt_content += f"NOTE {content}\n\n\n" + vtt_content += f"NOTE {content}\n\n\n" return vtt_content.strip() -def to_stacked_vtt(subtitles, continous = True): - vtt_content = "WEBVTT\n\n\n" - buffer = "" - for n, subtitle in enumerate(subtitles): - if subtitle.get("split", False): - buffer = "" - continue - if len(buffer) != 0: - if str(subtitle['content'].strip())[-1] == ".": - buffer += "\n" - else: - buffer += " " +def to_stacked_vtt(subtitles, continous=True): + vtt_content = "WEBVTT\n\n\n" + buffer = "" + for n, subtitle in enumerate(subtitles): + if subtitle.get("split", False): + buffer = "" + continue - buffer += subtitle['content'].strip() + if len(buffer) != 0: + if str(subtitle["content"].strip())[-1] == ".": + buffer += "\n" + else: + buffer += " " - if n < len(subtitles) - 1: - end_time = subtitles[n+1]['start'] if continous and not subtitles[n+1].get("split", False) else subtitle['end'] - else: - end_time = subtitle['end'] - - if not subtitle['start'] or not end_time: - raise Exception(f"VTT timestamp parse error from #{idx}.") - if subtitle['start'] == end_time: - raise Exception(f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}") - vtt_content += f"{subtitle['start']} --> {end_time}\n" - vtt_content += buffer - vtt_content += "\n\n\n" + buffer += subtitle["content"].strip() - print(f"{subtitle['start']} --> {end_time}\n{buffer}\n\n") + if n < len(subtitles) - 1: + end_time = ( + subtitles[n + 1]["start"] + if continous and not subtitles[n + 1].get("split", False) + else subtitle["end"] + ) + else: + end_time = subtitle["end"] + + if not subtitle["start"] or not end_time: + raise Exception(f"VTT timestamp parse error from #{idx}.") + if subtitle["start"] == end_time: + raise Exception( + f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}" + ) + vtt_content += f"{subtitle['start']} --> {end_time}\n" + vtt_content += buffer + vtt_content += "\n\n\n" + + print(f"{subtitle['start']} --> {end_time}\n{buffer}\n\n") + + return vtt_content - return vtt_content ### + def script_from_word_vtt(wordvtt): - subtitles = from_vtt(wordvtt) - print(f"Generating script file from VTT...") - sentences = [] - ADD_NEXT_SENTENCE = 0 - for n, subtitle in enumerate(subtitles): - sentence = subtitle["content"].replace("", "").replace("", "") - if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE: - sentences.append(sentence) - ADD_NEXT_SENTENCE = 0 - if subtitle["content"][-4:] == "": - ADD_NEXT_SENTENCE = 1 - if n + 2 < len(subtitles): - if subtitles[n+2]["content"].replace("", "").replace("", "") != sentence: - ADD_NEXT_SENTENCE = 0 - return sentences + subtitles = from_vtt(wordvtt) + print(f"Generating script file from VTT...") + sentences = [] + ADD_NEXT_SENTENCE = 0 + for n, subtitle in enumerate(subtitles): + sentence = subtitle["content"].replace("", "").replace("", "") + if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE: + sentences.append(sentence) + ADD_NEXT_SENTENCE = 0 + if subtitle["content"][-4:] == "": + ADD_NEXT_SENTENCE = 1 + if n + 2 < len(subtitles): + if ( + subtitles[n + 2]["content"].replace("", "").replace("", "") + != sentence + ): + ADD_NEXT_SENTENCE = 0 + return sentences + def create_word_scenes(raw_vtt, raw_script): - subtitles = from_vtt(raw_vtt) - scripts = [i for i in raw_script.split("\n") if i] - print(f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words...") + subtitles = from_vtt(raw_vtt) + scripts = [i for i in raw_script.split("\n") if i] + print( + f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words..." + ) - scenes = [] - for n, script in enumerate(scripts): - if len(script.split(" ")) == 1: - continue - scenes.append({"scene": script, "timestamp": []}) + scenes = [] + for n, script in enumerate(scripts): + if len(script.split(" ")) == 1: + continue + scenes.append({"scene": script, "timestamp": []}) - scenes_cur = 0 - for n, subtitle in enumerate(subtitles): - sentence = subtitle["content"].replace("", "").replace("", "") - if len(sentence.split(" ")) == 1: - continue + scenes_cur = 0 + for n, subtitle in enumerate(subtitles): + sentence = subtitle["content"].replace("", "").replace("", "") + if len(sentence.split(" ")) == 1: + continue - if sentence != scenes[scenes_cur].get("scene"): - if sentence == scenes[scenes_cur+1].get("scene"): - scenes_cur += 1 - else: - raise Exception(f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"") + if sentence != scenes[scenes_cur].get("scene"): + if sentence == scenes[scenes_cur + 1].get("scene"): + scenes_cur += 1 + else: + raise Exception( + f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"" + ) - current_scene = scenes[scenes_cur] - if current_scene["timestamp"]: - word_idx = current_scene["timestamp"][-1]["index"] + 1 - else: - word_idx = 0 + current_scene = scenes[scenes_cur] + if current_scene["timestamp"]: + word_idx = current_scene["timestamp"][-1]["index"] + 1 + else: + word_idx = 0 - if ("" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")): - # Ignore trailing dummy subtitle after last word indexed. - pass + if ("" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")): + # Ignore trailing dummy subtitle after last word indexed. + pass - if ("" in subtitle["content"]) and word_idx >= len(sentence.split(" ")): - # If there is trailing non-dummy timestamped subtitle, Reset word_idx and step to next scene. (Repeating sentence doesnt increment cur.) - print(f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}") - word_idx = 0 - scenes_cur += 1 - current_scene = scenes[scenes_cur] - if current_scene["timestamp"]: - word_idx = current_scene["timestamp"][-1]["index"] + 1 - else: - word_idx = 0 - print(f"Changed to {word_idx}, {scenes_cur}") + if ("" in subtitle["content"]) and word_idx >= len(sentence.split(" ")): + # If there is trailing non-dummy timestamped subtitle, Reset word_idx and step to next scene. (Repeating sentence doesnt increment cur.) + print( + f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}" + ) + word_idx = 0 + scenes_cur += 1 + current_scene = scenes[scenes_cur] + if current_scene["timestamp"]: + word_idx = current_scene["timestamp"][-1]["index"] + 1 + else: + word_idx = 0 + print(f"Changed to {word_idx}, {scenes_cur}") - # Start matching words. - if "" in subtitle["content"]: - word = subtitle["content"].split("")[1].split("")[0] + # Start matching words. + if "" in subtitle["content"]: + word = subtitle["content"].split("")[1].split("")[0] - if word not in sentence.split(" "): - raise Exception(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"") - return + if word not in sentence.split(" "): + raise Exception(f'Error, Mismatch\n=> "{word}" not in "{sentence}"') + return - try: - assert sentence.split(" ")[word_idx] == word - except: - raise Exception(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"") + try: + assert sentence.split(" ")[word_idx] == word + except: + raise Exception( + f'Error, Mismatch\n=> "{word}" != [{word_idx}] of "{sentence}"' + ) - word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word} - current_scene["timestamp"].append(word_time) + word_time = { + "start": subtitle["start"], + "end": subtitle["end"], + "index": word_idx, + "word": word, + } + current_scene["timestamp"].append(word_time) - for scene in scenes: - if len(scene["scene"].split(" ")) != len(scene["timestamp"]): - raise Exception("Error, Scene length and timestamp length doesnt match.") - if "" in scene["scene"].split(" "): - print(repr(scene["scene"])) + for scene in scenes: + if len(scene["scene"].split(" ")) != len(scene["timestamp"]): + raise Exception("Error, Scene length and timestamp length doesnt match.") + if "" in scene["scene"].split(" "): + print(repr(scene["scene"])) - full_script, full_scenes = [], [] - for scene in scenes: - full_script += scene["scene"].split(" ")[:-1] - full_script.append(scene["scene"].split(" ")[-1]+"##") - full_scenes += scene["timestamp"] + full_script, full_scenes = [], [] + for scene in scenes: + full_script += scene["scene"].split(" ")[:-1] + full_script.append(scene["scene"].split(" ")[-1] + "##") + full_scenes += scene["timestamp"] - for i, j in zip(full_script, full_scenes): - if i.replace("##", "") != j["word"]: - raise Exception("Error, Mismatch") - return + for i, j in zip(full_script, full_scenes): + if i.replace("##", "") != j["word"]: + raise Exception("Error, Mismatch") + return + + assert len(full_scenes) == len(full_script) + + return full_script, full_scenes - assert len(full_scenes) == len(full_script) - return full_script, full_scenes - def scene_from_new_script(raw_script, full_script, full_scenes): - mod_script = raw_script.replace("\n", " \n ").split(" ") - mod_script = [i for i in mod_script if i] - n = 0 - while True: - if mod_script[n] == "\n": - mod_script[n-1] += "\n" - del(mod_script[n]) - n -= 1 - n += 1 - if n == len(mod_script): - break - - print(f"Original: {len(full_script)}, Modded: {len(mod_script)}") - allowed_list = [".", "\n", "\n\n", ",", "?", "##"] + mod_script = raw_script.replace("\n", " \n ").split(" ") + mod_script = [i for i in mod_script if i] + n = 0 + while True: + if mod_script[n] == "\n": + mod_script[n - 1] += "\n" + del mod_script[n] + n -= 1 + n += 1 + if n == len(mod_script): + break + + print(f"Original: {len(full_script)}, Modded: {len(mod_script)}") + allowed_list = [".", "\n", "\n\n", ",", "?", "##"] + + def normalized(x): + for i in allowed_list: + x = x.replace(i, "") + return x.upper() + + same = lambda a, b: normalized(a) == normalized(b) + new_script, new_timestamp, orig_index, n = [], [], 0, 0 + fail = 0 + while n < len(mod_script): + print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}") + word = mod_script[n] + if same(word, full_script[orig_index].replace("##", "")): + cur = full_scenes[orig_index] + new_script.append(word.replace("##", "")) + new_timestamp.append({"start": cur["start"], "end": cur["end"]}) + fail = 0 + else: + if fail > 10: + raise Exception("Error: Failed to match words,") + return + fail += 1 + n -= 1 + n, orig_index = n + 1, orig_index + 1 + assert len(new_script) == len(new_timestamp) + return new_script, new_timestamp - def normalized(x): - for i in allowed_list: - x = x.replace(i, "") - return x.upper() - - same = lambda a, b: normalized(a) == normalized(b) - new_script, new_timestamp, orig_index, n = [], [], 0, 0 - fail = 0 - while n < len(mod_script): - print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}") - word = mod_script[n] - if same(word, full_script[orig_index].replace("##", "")): - cur = full_scenes[orig_index] - new_script.append(word.replace("##", "")) - new_timestamp.append({"start": cur["start"], "end": cur["end"]}) - fail = 0 - else: - if fail > 10: - raise Exception("Error: Failed to match words,") - return - fail += 1 - n -= 1 - n, orig_index = n+1, orig_index+1 - assert len(new_script) == len(new_timestamp) - return new_script, new_timestamp def build_new_subtitle(new_script, new_timestamp): - buffer, new_scenes, start, end = [], [], None, None - current_scene = [] - - for i, j in zip(new_script, new_timestamp): - buffer.append(i.replace("\n", "")) - if not start: - start = j["start"] + buffer, new_scenes, start, end = [], [], None, None + current_scene = [] - if "\n" in i: - current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]}) - buffer, start = [], None - - if "\n\n" in i: - print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"") - new_scenes.append(current_scene) - current_scene = [] + for i, j in zip(new_script, new_timestamp): + buffer.append(i.replace("\n", "")) + if not start: + start = j["start"] - if start: - buffer.append(i.replace("\n", "")) - current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]}) + if "\n" in i: + current_scene.append( + { + "content": " ".join(buffer).replace("##", ""), + "start": start, + "end": j["end"], + } + ) + buffer, start = [], None - if current_scene != (new_scenes[-1] if new_scenes else None): - new_scenes.append(current_scene) + if "\n\n" in i: + print( + f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"" + ) + new_scenes.append(current_scene) + current_scene = [] - newsub = [] - for n, i in enumerate(new_scenes): - newsub += i - if n < len(new_scenes) - 1: - newsub.append({"content": "Break", "start": None, "end": None, "split": True}) + if start: + buffer.append(i.replace("\n", "")) + current_scene.append( + {"content": " ".join(buffer), "start": start, "end": j["end"]} + ) + + if current_scene != (new_scenes[-1] if new_scenes else None): + new_scenes.append(current_scene) + + newsub = [] + for n, i in enumerate(new_scenes): + newsub += i + if n < len(new_scenes) - 1: + newsub.append( + {"content": "Break", "start": None, "end": None, "split": True} + ) + + return newsub - return newsub ### + def autobreak(lines, times): - from datetime import timedelta + from datetime import timedelta - def parsetime(time_str): - minutes, seconds = time_str.split(':') - seconds, milliseconds = seconds.split('.') - td = timedelta(minutes=int(minutes), seconds=int(seconds), milliseconds=int(milliseconds)) - return td + def parsetime(time_str): + minutes, seconds = time_str.split(":") + seconds, milliseconds = seconds.split(".") + td = timedelta( + minutes=int(minutes), seconds=int(seconds), milliseconds=int(milliseconds) + ) + return td - script = [] - long_breaks = [] - tmark = parsetime("0:0.0") - for i, j in zip(lines, times): - tdiff = parsetime(j["start"]) - tmark - tmark = parsetime(j["end"]) - if tdiff > parsetime("0:0.0"): - long_breaks.append(tdiff) + script = [] + long_breaks = [] + tmark = parsetime("0:0.0") + for i, j in zip(lines, times): + tdiff = parsetime(j["start"]) - tmark + tmark = parsetime(j["end"]) + if tdiff > parsetime("0:0.0"): + long_breaks.append(tdiff) - mean_break = parsetime("0:0.0") - for i in long_breaks: - mean_break += i/len(long_breaks) - print(mean_break) + mean_break = parsetime("0:0.0") + for i in long_breaks: + mean_break += i / len(long_breaks) + print(mean_break) - script = "" - tmark = parsetime("0:0.0") - tmp = " " + script = "" + tmark = parsetime("0:0.0") + tmp = " " - continous_line = 0 - for i, j in zip(lines, times): - tdiff = parsetime(j["start"]) - tmark - tmark = parsetime(j["end"]) - if tdiff > mean_break and tmp[-1] != ".": - script += "\n" + continous_line = 0 + for i, j in zip(lines, times): + tdiff = parsetime(j["start"]) - tmark + tmark = parsetime(j["end"]) + if tdiff > mean_break and tmp[-1] != ".": + script += "\n" - if (tdiff >= mean_break and tmp[-1] == "."): - script += "\n" - continous_line = 0 - else: - continous_line += 1 + if tdiff >= mean_break and tmp[-1] == ".": + script += "\n" + continous_line = 0 + else: + continous_line += 1 - script += i.replace("##", "") + script += i.replace("##", "") + + if i[-1] == ".": + script += "\n" + elif "##" in i: + script += "\n" + else: + script += " " + tmp = i + + return script - if i[-1] == ".": - script += "\n" - elif "##" in i: - script += "\n" - else: - script += " " - tmp = i - - return script ############################################ -def saveFile(filename, data, override = False): - if os.path.exists(filename) and not override: - raise Exception(f"File {filename} already exists.") - return - with open(filename, "w") as f: - f.write(data) + +def saveFile(filename, data, override=False): + if os.path.exists(filename) and not override: + raise Exception(f"File {filename} already exists.") + return + with open(filename, "w") as f: + f.write(data) + def openFile(filename): - if not os.path.exists(filename): - raise Exception(f"File {filename} doesnt exists.") - return - with open(filename, "r") as f: - data = f.read() - if not data: - raise Exception("Data empty.") - return - return data + if not os.path.exists(filename): + raise Exception(f"File {filename} doesnt exists.") + return + with open(filename, "r") as f: + data = f.read() + if not data: + raise Exception("Data empty.") + return + return data + ############################################ -if __name__=="__main__": - PROG = sys.argv[0].split("/")[-1] - if len(sys.argv) not in (3, 4): - print( \ -f"""Usage: {PROG} [COMMAND] [FILES]... +if __name__ == "__main__": + PROG = sys.argv[0].split("/")[-1] + if len(sys.argv) not in (3, 4): + print( + f"""Usage: {PROG} [COMMAND] [FILES]... Commands: - script Generates script file from vtt file. - apply