diff --git a/snuenc.sh b/snuenc.sh
new file mode 100644
index 0000000..be52aab
--- /dev/null
+++ b/snuenc.sh
@@ -0,0 +1,5 @@
+# Preview helper: renders the stacked VTT onto a gray 508x1080 panel, places the
+# panel to the right of the input video with hstack, and pipes the result to ffplay.
+# The panel's duration is set to the input's duration as reported by ffprobe.
+# File-name expansions are quoted so that paths containing spaces are not word-split.
+i="input.mp4";j="test.stacked.vtt";ffmpeg -f lavfi -i color=c=gray:s=508x1080:r=ntsc:d="$(ffprobe -i "$i" -show_entries format=duration -v quiet -of csv="p=0")" -vf "subtitles=$j:force_style='FontName=Helvetica,Alignment=4,Fontsize=9.5,Outline=0,Shadow=0,MarginH=2,MarginV=4,Spacing=0'" -b:v 2000k -f nut - | ffmpeg -i "$i" -i - -filter_complex "[0:v][1:v]hstack=inputs=2:shortest=1[v];[0:a]anull[a2];[0:a][a2]amerge[a]" -map "[v]" -map "[a]" -b:v 2000k -f nut - | ffplay -
diff --git a/snusub.py b/snusub.py
index 02ecad8..b9ab1e5 100644
--- a/snusub.py
+++ b/snusub.py
@@ -4,6 +4,8 @@ import re, json
import os, sys
from datetime import timedelta
+###
+
def from_vtt(vtt_string):
VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
VTT_LINE_NUMBER_PATTERN = r"^\d+$"
@@ -24,6 +26,9 @@ def from_vtt(vtt_string):
start, end = match.groups()
content = '\n'.join(lines[1:]) + "\n"
+ # if start == end:
+ # continue
+
subtitles.append({
'start': start,
'end': end,
@@ -39,6 +44,8 @@ def to_vtt(subtitles):
if not subtitle.get("split", False):
start = subtitle['start']
end = subtitle['end']
+ if not start or not end or start == end:
+ raise Exception(f"VTT timestamp parse error from #{idx}.")
vtt_content += f"{start} --> {end}\n{content}\n\n\n"
else:
vtt_content += f"NOTE {content}\n\n\n"
@@ -58,6 +65,7 @@ def to_stacked_vtt(subtitles, continous = True):
buffer += "\n"
else:
buffer += " "
+
buffer += subtitle['content'].strip()
if n < len(subtitles) - 1:
@@ -65,6 +73,10 @@ def to_stacked_vtt(subtitles, continous = True):
else:
end_time = subtitle['end']
+ if not subtitle['start'] or not end_time:
+ raise Exception(f"VTT timestamp parse error from #{idx}.")
+ if subtitle['start'] == end_time:
+ raise Exception(f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}")
vtt_content += f"{subtitle['start']} --> {end_time}\n"
vtt_content += buffer
vtt_content += "\n\n\n"
@@ -73,16 +85,18 @@ def to_stacked_vtt(subtitles, continous = True):
return vtt_content
+###
+
def script_from_word_vtt(wordvtt):
subtitles = from_vtt(wordvtt)
- print(f"VTT {len(subtitles)} lines. Generating script file from VTT.")
+ print(f"Generating script file from VTT...")
sentences = []
- EXCEPTION_FLAG, ADD_NEXT_SENTENCE = "", 0
+ ADD_NEXT_SENTENCE = 0
for n, subtitle in enumerate(subtitles):
sentence = subtitle["content"].replace("", "").replace("", "")
if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE:
sentences.append(sentence)
- ADD_NEXT_SENTENCE = 0
+ ADD_NEXT_SENTENCE = 0
if subtitle["content"][-4:] == "":
ADD_NEXT_SENTENCE = 1
if n + 2 < len(subtitles):
@@ -90,10 +104,11 @@ def script_from_word_vtt(wordvtt):
ADD_NEXT_SENTENCE = 0
return sentences
-def create_word_scenes(wordvtt, scriptraw):
- subtitles = from_vtt(wordvtt)
- scripts = [i for i in scriptraw.split("\n") if i]
- print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")
+def create_word_scenes(raw_vtt, raw_script):
+ subtitles = from_vtt(raw_vtt)
+ scripts = [i for i in raw_script.split("\n") if i]
+ print(f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words...")
+
scenes = []
for n, script in enumerate(scripts):
if len(script.split(" ")) == 1:
@@ -110,8 +125,7 @@ def create_word_scenes(wordvtt, scriptraw):
if sentence == scenes[scenes_cur+1].get("scene"):
scenes_cur += 1
else:
- print(f"Error, Mismatch in scenes\n=>\"[{scenes_cur}] {scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
- return
+ raise Exception(f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
current_scene = scenes[scenes_cur]
if current_scene["timestamp"]:
@@ -120,9 +134,12 @@ def create_word_scenes(wordvtt, scriptraw):
word_idx = 0
if ("" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
+ # Ignore trailing dummy subtitle after last word indexed.
pass
+
if ("" in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
- print(f"Error, index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
+            # If a timestamped (non-dummy) subtitle trails the last indexed word, reset word_idx and step to the next scene. (A repeated sentence doesn't increment cur.)
+ print(f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
word_idx = 0
scenes_cur += 1
current_scene = scenes[scenes_cur]
@@ -132,26 +149,25 @@ def create_word_scenes(wordvtt, scriptraw):
word_idx = 0
print(f"Changed to {word_idx}, {scenes_cur}")
+ # Start matching words.
if "" in subtitle["content"]:
word = subtitle["content"].split("")[1].split("")[0]
if word not in sentence.split(" "):
- print(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
+ raise Exception(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
return
try:
assert sentence.split(" ")[word_idx] == word
except:
- print(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
- return
+ raise Exception(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word}
current_scene["timestamp"].append(word_time)
for scene in scenes:
if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
- print("Error, Mismatch length")
- return
+ raise Exception("Error, Scene length and timestamp length doesnt match.")
if "" in scene["scene"].split(" "):
print(repr(scene["scene"]))
@@ -163,13 +179,90 @@ def create_word_scenes(wordvtt, scriptraw):
for i, j in zip(full_script, full_scenes):
if i.replace("##", "") != j["word"]:
- print("Error, Mismatch")
+ raise Exception("Error, Mismatch")
return
assert len(full_scenes) == len(full_script)
return full_script, full_scenes
+def scene_from_new_script(raw_script, full_script, full_scenes):
+    """Align the words of an edited script with the original word timings.
+
+    The edited script is split on spaces and every newline is folded onto the
+    word before it, so a word carrying one trailing newline closes a line and
+    a word carrying two closes a section. Each edited word is then matched, in
+    order, against ``full_script``; original words with no counterpart in the
+    edited script are skipped. Matching ignores case, the "##" marker and the
+    characters ".", ",", "?" and newline.
+
+    Args:
+        raw_script: Edited script text.
+        full_script: Original words, in order (may carry "##" markers).
+        full_scenes: One entry per original word; the "start" and "end" keys
+            of the matched entry are copied to the output.
+
+    Returns:
+        ``(new_script, new_timestamp)``: the edited words (with "##" removed,
+        newlines kept) and a parallel list of ``{"start", "end"}`` dicts.
+
+    Raises:
+        Exception: If more than eleven original words in a row fail to match
+            the current edited word, or if ``full_script`` runs out before
+            every edited word has been matched.
+    """
+    # Isolate every newline as its own token, then glue it onto the preceding word.
+    tokens = []
+    for piece in raw_script.replace("\n", " \n ").split(" "):
+        if not piece:
+            continue
+        if piece != "\n":
+            tokens.append(piece)
+        elif tokens:
+            tokens[-1] += "\n"
+        # A newline before the first word has nothing to attach to and is dropped;
+        # indexing with -1 would glue it onto the last word of the script instead.
+
+    print(f"Original: {len(full_script)}, Modded: {len(tokens)}")
+    ignored = (".", "\n", ",", "?", "##")
+
+    def normalized(text):
+        for mark in ignored:
+            text = text.replace(mark, "")
+        return text.upper()
+
+    new_script, new_timestamp = [], []
+    orig_index = fail = n = 0
+    while n < len(tokens):
+        if orig_index >= len(full_script):
+            # Report the mismatch explicitly instead of surfacing a bare IndexError.
+            raise Exception(f"Error: Failed to match words, original script ran out before {tokens[n]!r} was matched.")
+        word = tokens[n]
+        print(f"{repr(word):>20} ? {repr(full_script[orig_index])}")
+        if normalized(word) == normalized(full_script[orig_index]):
+            cur = full_scenes[orig_index]
+            new_script.append(word.replace("##", ""))
+            new_timestamp.append({"start": cur["start"], "end": cur["end"]})
+            fail = 0
+            n += 1
+        else:
+            # Stay on the same edited word and try the next original word.
+            if fail > 10:
+                raise Exception("Error: Failed to match words,")
+            fail += 1
+        orig_index += 1
+    return new_script, new_timestamp
+
+def build_new_subtitle(new_script, new_timestamp):
+    """Group timestamped words into subtitle lines and sections.
+
+    Words are accumulated until one containing a newline closes the line; the
+    line spans from the first word's "start" to the closing word's "end". A
+    word containing two newlines also closes the current section. Words left
+    over after the last newline form a final line.
+
+    Args:
+        new_script: Words, with line/section breaks carried as trailing newlines.
+        new_timestamp: Parallel list of dicts with "start" and "end" keys.
+
+    Returns:
+        A flat list of ``{"content", "start", "end"}`` dicts, with a
+        ``{"content": "Break", "start": None, "end": None, "split": True}``
+        marker between consecutive sections.
+    """
+    sections, lines, words = [], [], []
+    start = end = None
+
+    for word, stamp in zip(new_script, new_timestamp):
+        words.append(word.replace("\n", ""))
+        if not start:
+            start = stamp["start"]
+        end = stamp["end"]
+
+        if "\n" in word:
+            lines.append({"content": " ".join(words).replace("##", ""), "start": start, "end": end})
+            words, start = [], None
+
+            if "\n\n" in word:
+                # Pulled into a local so the f-string does not nest same-type quotes.
+                last_line = lines[-1]["content"]
+                print(f"Section break at line #{len(lines):<3}| \"{last_line}\"")
+                sections.append(lines)
+                lines = []
+
+    if start:
+        # Flush the unterminated last line. Its words are already in the buffer,
+        # so nothing is appended here (re-appending duplicated the final word).
+        lines.append({"content": " ".join(words).replace("##", ""), "start": start, "end": end})
+
+    if lines:
+        # Only a non-empty trailing section is kept; an empty one would leave a
+        # dangling "Break" marker at the end of the output.
+        sections.append(lines)
+
+    newsub = []
+    for n, section in enumerate(sections):
+        newsub += section
+        if n < len(sections) - 1:
+            newsub.append({"content": "Break", "start": None, "end": None, "split": True})
+
+    return newsub
+
+###
+
def autobreak(lines, times):
from datetime import timedelta
@@ -222,108 +315,39 @@ def autobreak(lines, times):
return script
-def scene_from_new_script(raw_script, full_script, full_scenes):
- mod_script = raw_script.replace("\n", " \n ").split(" ")
- mod_script = [i for i in mod_script if i]
- n = 0
- while True:
- if mod_script[n] == "\n":
- mod_script[n-1] += "\n"
- del(mod_script[n])
- n -= 1
- n += 1
- if n == len(mod_script):
- break
-
- print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
- allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
-
- def normalized(x):
- for i in allowed_list:
- x = x.replace(i, "")
- return x.upper()
-
- same = lambda a, b: normalized(a) == normalized(b)
- new_script, new_timestamp, orig_index, n = [], [], 0, 0
- fail = 0
- while n < len(mod_script):
- print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
- word = mod_script[n]
- if same(word, full_script[orig_index].replace("##", "")):
- cur = full_scenes[orig_index]
- new_script.append(word.replace("##", ""))
- new_timestamp.append({"start": cur["start"], "end": cur["end"]})
- fail = 0
- else:
- if fail > 10:
- print("Error: Failed to match words,")
- return
- fail += 1
- n -= 1
- n, orig_index = n+1, orig_index+1
- assert len(new_script) == len(new_timestamp)
- return new_script, new_timestamp
-
-def build_new_subtitle(new_script, new_timestamp):
- buffer, new_scenes, start, end = [], [], None, None
- current_scene = []
-
- for i, j in zip(new_script, new_timestamp):
- if "\n" in i:
- buffer.append(i.replace("\n", ""))
- current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]})
- buffer, start = [], None
-
- if "\n\n" in i:
- print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"")
- new_scenes.append(current_scene)
- current_scene = []
-
- else:
- buffer.append(i)
- if not start:
- start = j["start"]
-
- if start:
- buffer.append(i.replace("\n", ""))
- current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]})
-
- if current_scene != (new_scenes[-1] if new_scenes else None):
- new_scenes.append(current_scene)
-
- newsub = []
- for n, i in enumerate(new_scenes):
- newsub += i
- if n < len(new_scenes) - 1:
- newsub.append({"content": "Break", "start": None, "end": None, "split": True})
-
- return newsub
-
-###
+############################################
def saveFile(filename, data, override = False):
if os.path.exists(filename) and not override:
- print(f"File {filename} already exists.")
- return -1
+        # FileExistsError is a built-in subclass of Exception, so broad handlers
+        # still catch it while callers can also single this case out.
+        raise FileExistsError(f"File {filename} already exists.")
with open(filename, "w") as f:
f.write(data)
def openFile(filename):
+    if not os.path.exists(filename):
+        # FileNotFoundError is a built-in subclass of Exception, so broad handlers
+        # still catch it while callers can also single this case out.
+        raise FileNotFoundError(f"File {filename} doesnt exists.")
with open(filename, "r") as f:
data = f.read()
if not data:
- return -1
+ raise Exception("Data empty.")
+ return
return data
-###
+############################################
if __name__=="__main__":
+ PROG = sys.argv[0].split("/")[-1]
if len(sys.argv) not in (3, 4):
- PROG = sys.argv[0].split("/")[-1]
- print(f"Usage: {PROG} script [VTT file] \n" \
- f" {" "*len(PROG)} apply [VTT file] [script file] \n" \
- f" {" "*len(PROG)} create [JSON file]" \
- )
+ print( \
+f"""Usage: {PROG} [COMMAND] [FILES]...
+
+Commands:
+ - script Generates script file from vtt file.
+ - apply