diff --git a/convert.py b/backup.py
similarity index 100%
rename from convert.py
rename to backup.py
diff --git a/restack.py b/stackvtt.py
similarity index 60%
rename from restack.py
rename to stackvtt.py
index 451bda8..5c6e58b 100644
--- a/restack.py
+++ b/stackvtt.py
@@ -4,7 +4,7 @@ from datetime import timedelta
VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
VTT_LINE_NUMBER_PATTERN = r"^\d+$"
-def parse_vtt(vtt_string):
+def from_vtt(vtt_string):
parts = re.split(r'\n\n+', vtt_string.strip())
if parts[0].startswith('WEBVTT'):
@@ -35,47 +35,39 @@ def parse_vtt(vtt_string):
def to_vtt(subtitles):
vtt_content = "WEBVTT\n\n"
for idx, subtitle in enumerate(subtitles):
- # print(subtitle, idx)
start = subtitle['start']
end = subtitle['end']
content = subtitle['content']
vtt_content += f"{start} --> {end}\n{content}\n\n"
return vtt_content.strip()
+def stack_subtitle():
+ buffer = []
+ linebuf = []
+ for line in parsed_vtt:
+ print(line["content"].strip())
+ content = line["content"].strip()
+ if True:
+ linebuf.append(line)
+ else:
+ linebuf.append(line)
+ buffer.append(linebuf)
+ linebuf = []
+
+ sub = []
+ for section in buffer:
+ strbuf = ""
+ for scene in section:
+ strbuf += scene["content"]
+ # if scene["content"][-1] == ".":
+ strbuf += "\n"
+ # else:
+ # strbuf += " "
+ scene["content"] = strbuf
+ sub.append(scene)
with open("example.vtt", "r") as f:
vtt_content = f.read()
-parsed_vtt = parse_vtt(vtt_content)
-#print(len(parsed_vtt))
-
-buffer = []
-linebuf = []
-
-for line in parsed_vtt:
-# print(line["content"].strip())
- content = line["content"].strip()
- if "".join([i["content"] for i in linebuf]).count(".") < 4 or len(linebuf) < 5:
- linebuf.append(line)
- else:
- linebuf.append(line)
- buffer.append(linebuf)
- linebuf = []
-
-# print(buffer)
-
-sub = []
-for section in buffer:
- strbuf = ""
- for scene in section:
- strbuf += scene["content"]
- # if scene["content"][-1] == ".":
- strbuf += "\n"
- # else:
- # strbuf += " "
- scene["content"] = strbuf
- sub.append(scene)
-
-# print(buffer[0])
-
-print(to_vtt(sub))
\ No newline at end of file
+parsed_vtt = from_vtt(vtt_content)
+print(to_vtt(stack_subtitle(parsed_vtt)))
\ No newline at end of file
diff --git a/wordvtt.py b/wordvtt.py
new file mode 100644
index 0000000..e3bcf7e
--- /dev/null
+++ b/wordvtt.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+
+import re, json
+import os
+from datetime import timedelta
+
+def from_vtt(vtt_string):
+ VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
+ VTT_LINE_NUMBER_PATTERN = r"^\d+$"
+ parts = re.split(r'\n\n+', vtt_string.strip())
+ if parts[0].startswith('WEBVTT'):
+ parts.pop(0)
+
+ subtitles = []
+ for part in parts:
+ lines = part.split('\n')
+ match = re.match(VTT_TIMECODE_PATTERN, lines[0])
+ if not match:
+ if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
+ lines.pop(0)
+ match = re.match(VTT_TIMECODE_PATTERN, lines[0])
+ if not match:
+ continue
+
+ start, end = match.groups()
+ content = '\n'.join(lines[1:])
+ subtitles.append({
+ 'start': start,
+ 'end': end,
+ 'content': content
+ })
+
+ return subtitles
+
+def to_vtt(subtitles):
+ vtt_content = "WEBVTT\n\n\n"
+ for idx, subtitle in enumerate(subtitles):
+ content = subtitle['content']
+ if not subtitle.get("split", False):
+ start = subtitle['start']
+ end = subtitle['end']
+ vtt_content += f"{start} --> {end}\n{content}\n\n\n"
+ else:
+ vtt_content += f"NOTE {content}\n\n\n"
+
+ return vtt_content.strip()
+
+def to_stacked_vtt(subtitles):
+ vtt_content = "WEBVTT\n\n\n"
+ buffer = ""
+ for subtitle in subtitles:
+ if subtitle.get("split", False):
+ buffer = ""
+ continue
+ if len(buffer) != 0:
+ if str(subtitle['content'].strip())[-1] == ".":
+ buffer += "\n"
+ else:
+ buffer += " "
+ buffer += subtitle['content'].strip()
+ vtt_content += f"{to_time(subtitle['start'])} --> {to_time(subtitle['end'])}\n"
+ vtt_content += buffer
+ vtt_content += "\n\n\n"
+
+def create_word_scenes(wordvtt, scriptraw):
+ subtitles = from_vtt(wordvtt)
+ scripts = [i for i in scriptraw.split("\n") if i]
+ print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")
+ scenes = []
+ for n, script in enumerate(scripts):
+ if len(script.split(" ")) == 1:
+ continue
+ scenes.append({"scene": script, "timestamp": []})
+
+ scenes_cur = 0
+ for n, subtitle in enumerate(subtitles):
+ sentence = subtitle["content"].replace("", "").replace("", "")
+ if len(sentence.split(" ")) == 1:
+ continue
+
+ if sentence != scenes[scenes_cur].get("scene"):
+ if sentence == scenes[scenes_cur+1].get("scene"):
+ scenes_cur += 1
+ else:
+ print(f"Error, Mismatch\n=> scenes[{scenes_cur}] != \"{sentence}\"")
+ return
+
+ current_scene = scenes[scenes_cur]
+ if current_scene["timestamp"]:
+ word_idx = current_scene["timestamp"][-1]["index"] + 1
+ else:
+ word_idx = 0
+
+ if "" in subtitle["content"]:
+ word = subtitle["content"].split("")[1].split("")[0]
+ if word not in sentence:
+ print(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
+ return
+
+ try:
+ assert sentence.split(" ")[word_idx] == word
+ except:
+ print(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
+ return
+
+ word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word}
+ current_scene["timestamp"].append(word_time)
+
+ # print(json.dumps(scenes, indent=2))
+
+ for scene in scenes:
+ if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
+ print("Error, Mismatch length")
+ return
+
+ full_script, full_scenes = [], []
+ for scene in scenes:
+ full_script += scene["scene"].split(" ")
+ full_scenes += scene["timestamp"]
+
+ for i, j in zip(full_script, full_scenes):
+ if i != j["word"]:
+ print("Error, Mismatch")
+ return
+
+ assert len(full_scenes) == len(full_script)
+
+ return full_script, full_scenes
+
+def scene_from_new_script(raw_script, full_script, full_scenes):
+ mod_script = raw_script.replace("\n", " \n ").split(" ")
+ mod_script = [i for i in mod_script if i]
+ n = 0
+ while True:
+ if mod_script[n] == "\n":
+ mod_script[n-1] += "\n"
+ del(mod_script[n])
+ n -= 1
+ n += 1
+ if n == len(mod_script):
+ break
+ # print(mod_script)
+ print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
+ allowed_list = [".", "\n", "\n\n", ","]
+
+ def normalized(x):
+ for i in allowed_list:
+ x = x.replace(i, "")
+ return x.upper()
+
+ same = lambda a, b: normalized(a) == normalized(b)
+ new_script, new_timestamp, orig_index, n = [], [], 0, 0
+ while n < len(mod_script):
+ # print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
+ word = mod_script[n]
+ if same(word, full_script[orig_index]):
+ cur = full_scenes[orig_index]
+ new_script.append(word)
+ new_timestamp.append({"start": cur["start"], "end": cur["end"]})
+ else:
+ # print("Back")
+ n -= 1
+ n, orig_index = n+1, orig_index+1
+
+ assert len(new_script) == len(new_timestamp)
+ return new_script, new_timestamp
+
+def build_new_subtitle(new_script, new_timestamp):
+ buffer, new_scenes, start, end = [], [], None, None
+ current_scene = []
+ # print(" ".join(new_script).split("\n"))
+
+ for i, j in zip(new_script, new_timestamp):
+ if "\n" in i:
+ buffer.append(i.replace("\n", ""))
+ current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]})
+ buffer, start = [], None
+ if "\n\n" in i:
+ print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"")
+ new_scenes.append(current_scene)
+ current_scene = []
+ else:
+ buffer.append(i)
+ if not start:
+ start = j["start"]
+
+ if start:
+ buffer.append(i.replace("\n", ""))
+ current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]})
+
+ if current_scene != (new_scenes[-1] if new_scenes else None):
+ new_scenes.append(current_scene)
+
+ # print("\n\n".join(["\n".join([j["content"] for j in i]) for i in new_scenes]))
+ newsub = []
+ for n, i in enumerate(new_scenes):
+ newsub += i
+ if n < len(new_scenes) - 1:
+ newsub.append({"content": "Break", "start": None, "end": None, "split": True})
+
+ return newsub
+
+def saveFile(filename, data, override = False):
+ if os.path.exists(filename) and not override:
+ print(f"File {filename} already exists.")
+ return -1
+ with open(filename, "w") as f:
+ f.write(data)
+
+def openFile(filename):
+ with open(filename, "r") as f:
+ data = f.read()
+ if not data:
+ return -1
+ return data
+
+def main():
+ vttfile = "test.vtt"
+ scriptfile = "test.txt"
+ modfile = "test.script"
+
+ full_script, full_scenes = create_word_scenes(openFile(vttfile), openFile(scriptfile))
+ saveFile("test.script", " ".join(full_script).replace(". ", ".\n"))
+ a, b = scene_from_new_script(openFile(modfile), full_script, full_scenes)
+ final_vtt = build_new_subtitle(a, b)
+ # print(final_vtt)
+ saveFile("test.final.vtt", to_vtt(final_vtt), True)
+ saveFile("test.final.json", json.dumps(final_vtt, indent=2), True)
+
+if __name__=="__main__":
+ main()
\ No newline at end of file