From 368c26d491ae54bf2e30c2a84cb1a3de795799cb Mon Sep 17 00:00:00 2001
From: Morgan <me@morgan.kr>
Date: Thu, 15 Feb 2024 01:32:58 +0900
Subject: [PATCH] Raise exceptions on errors; add subedit.py and snuenc.sh

---
 snuenc.sh  |   1 +
 snusub.py  | 238 +++++++++++++++++++++++++++++------------------------
 subedit.py |  79 ++++++++++++++++++
 3 files changed, 211 insertions(+), 107 deletions(-)
 create mode 100644 snuenc.sh
 create mode 100644 subedit.py
diff --git a/snuenc.sh b/snuenc.sh
new file mode 100644
index 0000000..be52aab
--- /dev/null
+++ b/snuenc.sh
@@ -0,0 +1 @@
+i="input.mp4";j="test.stacked.vtt";ffmpeg -f lavfi -i color=c=gray:s=508x1080:r=ntsc:d="$(ffprobe -i "$i" -show_entries format=duration -v quiet -of csv="p=0")" -vf "subtitles=$j:force_style='FontName=Helvetica,Alignment=4,Fontsize=9.5,Outline=0,Shadow=0,MarginH=2,MarginV=4,Spacing=0'" -b:v 2000k  -f nut - | ffmpeg -i "$i" -i - -filter_complex "[0:v][1:v]hstack=inputs=2:shortest=1[v];[0:a]anull[a2];[0:a][a2]amerge[a]" -map "[v]" -map "[a]" -b:v 2000k -f nut - | ffplay - # preview: input video hstacked with the gray subtitle panel
diff --git a/snusub.py b/snusub.py
index 02ecad8..b9ab1e5 100644
--- a/snusub.py
+++ b/snusub.py
@@ -4,6 +4,8 @@ import re, json
 import os, sys
 from datetime import timedelta
 
+###
+
 def from_vtt(vtt_string):
   VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
   VTT_LINE_NUMBER_PATTERN = r"^\d+$"
@@ -24,6 +26,9 @@ def from_vtt(vtt_string):
 
     start, end = match.groups()
     content = '\n'.join(lines[1:]) + "\n"
+    # if start == end:
+    #   continue
+      
     subtitles.append({
       'start': start,
       'end': end,
@@ -39,6 +44,8 @@ def to_vtt(subtitles):
         if not subtitle.get("split", False):
           start = subtitle['start']
           end = subtitle['end']
+          if not start or not end or start == end:
+            raise Exception(f"VTT timestamp parse error from #{idx}.")
           vtt_content += f"{start} --> {end}\n{content}\n\n\n"
         else:
           vtt_content += f"NOTE {content}\n\n\n"
@@ -58,6 +65,7 @@ def to_stacked_vtt(subtitles, continous = True):
         buffer += "\n"
       else:
         buffer += " "
+
     buffer += subtitle['content'].strip()
 
     if n < len(subtitles) - 1:
@@ -65,6 +73,10 @@ def to_stacked_vtt(subtitles, continous = True):
     else:
       end_time = subtitle['end']
     
+    if not subtitle['start'] or not end_time:
+      raise Exception(f"VTT timestamp parse error from #{n}.")  # n: index of this subtitle in the loop
+    if subtitle['start'] == end_time:
+      raise Exception(f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}")
     vtt_content += f"{subtitle['start']} --> {end_time}\n"
     vtt_content += buffer
     vtt_content += "\n\n\n"
@@ -73,16 +85,18 @@ def to_stacked_vtt(subtitles, continous = True):
 
   return vtt_content
 
+###
+
 def script_from_word_vtt(wordvtt):
   subtitles = from_vtt(wordvtt)
-  print(f"VTT {len(subtitles)} lines. Generating script file from VTT.")
+  print(f"Generating script file from VTT...")
   sentences = []
-  EXCEPTION_FLAG, ADD_NEXT_SENTENCE = "", 0
+  ADD_NEXT_SENTENCE = 0
   for n, subtitle in enumerate(subtitles):
     sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
     if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE:
       sentences.append(sentence)
-    ADD_NEXT_SENTENCE = 0
+      ADD_NEXT_SENTENCE = 0
     if subtitle["content"][-4:] == "</u>":
       ADD_NEXT_SENTENCE = 1
       if n + 2 < len(subtitles):
@@ -90,10 +104,11 @@ def script_from_word_vtt(wordvtt):
           ADD_NEXT_SENTENCE = 0
   return sentences
 
-def create_word_scenes(wordvtt, scriptraw):
-  subtitles = from_vtt(wordvtt)
-  scripts   = [i for i in scriptraw.split("\n") if i]
-  print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")
+def create_word_scenes(raw_vtt, raw_script):
+  subtitles = from_vtt(raw_vtt)
+  scripts   = [i for i in raw_script.split("\n") if i]
+  print(f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words...")
+
   scenes = []
   for n, script in enumerate(scripts):
     if len(script.split(" ")) == 1:
@@ -110,8 +125,7 @@ def create_word_scenes(wordvtt, scriptraw):
       if sentence == scenes[scenes_cur+1].get("scene"):
         scenes_cur += 1
       else:
-        print(f"Error, Mismatch in scenes\n=>\"[{scenes_cur}] {scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
-        return
+        raise Exception(f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
 
     current_scene = scenes[scenes_cur]
     if current_scene["timestamp"]:
@@ -120,9 +134,12 @@ def create_word_scenes(wordvtt, scriptraw):
       word_idx = 0
 
     if ("<u>" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
+      # Ignore trailing dummy subtitle after last word indexed.
       pass
+
     if ("<u>" in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
-      print(f"Error, index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
+      # If a timestamped (non-dummy) subtitle still follows the last indexed word, reset word_idx and step to the next scene. (A repeated sentence doesn't increment scenes_cur.)
+      print(f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
       word_idx = 0
       scenes_cur += 1
       current_scene = scenes[scenes_cur]
@@ -132,26 +149,25 @@ def create_word_scenes(wordvtt, scriptraw):
         word_idx = 0
       print(f"Changed to {word_idx}, {scenes_cur}")
 
+    # Start matching words.
     if "<u>" in subtitle["content"]:
       word = subtitle["content"].split("<u>")[1].split("</u>")[0]
 
       if word not in sentence.split(" "):
-        print(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
+        raise Exception(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
         return
 
       try:
         assert sentence.split(" ")[word_idx] == word
       except:
-        print(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
-        return
+        raise Exception(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
 
       word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word}
       current_scene["timestamp"].append(word_time)
 
   for scene in scenes:
     if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
-      print("Error, Mismatch length")
-      return
+      raise Exception("Error, Scene length and timestamp length doesnt match.")
     if "" in scene["scene"].split(" "):
       print(repr(scene["scene"]))
 
@@ -163,13 +179,90 @@ def create_word_scenes(wordvtt, scriptraw):
 
   for i, j in zip(full_script, full_scenes):
     if i.replace("##", "") != j["word"]:
-      print("Error, Mismatch")
+      raise Exception("Error, Mismatch")
       return
 
   assert len(full_scenes) == len(full_script)
 
   return full_script, full_scenes
   
+def scene_from_new_script(raw_script, full_script, full_scenes):
+  """Map every word of the edited raw_script onto the timestamps of the original words.
+
+  full_script and full_scenes are walked in step; an original word that does not match is skipped.
+  Returns (new_script, new_timestamp); raises Exception when the words cannot be matched."""
+  mod_script = []
+  for token in raw_script.replace("\n", " \n ").split(" "):
+    if token == "\n" and mod_script:
+      mod_script[-1] += "\n"  # glue each line break onto the word before it
+    elif token and token != "\n":  # empty tokens and leading line breaks are dropped
+      mod_script.append(token)
+
+  print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
+  allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
+
+  def normalized(x):
+    for i in allowed_list:
+      x = x.replace(i, "")
+    return x.upper()
+
+  same = lambda a, b: normalized(a) == normalized(b)
+  new_script, new_timestamp, orig_index, n = [], [], 0, 0
+  fail = 0
+  while n < len(mod_script):
+    if orig_index >= len(full_script):  # edited words remain but the original is used up
+      raise Exception("Error: Failed to match words, original script exhausted.")
+    print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
+    word = mod_script[n]
+    if same(word, full_script[orig_index].replace("##", "")):
+      cur = full_scenes[orig_index]
+      new_script.append(word.replace("##", ""))
+      new_timestamp.append({"start": cur["start"], "end": cur["end"]})
+      fail = 0
+    else:
+      if fail > 10:  # give up after more than 10 consecutive mismatches
+        raise Exception("Error: Failed to match words,")
+      fail += 1
+      n -= 1  # stay on this edited word; only orig_index advances below
+    n, orig_index = n+1, orig_index+1
+  assert len(new_script) == len(new_timestamp)
+  return new_script, new_timestamp
+
+def build_new_subtitle(new_script, new_timestamp):
+  """Group timed words into subtitle lines and scenes; scenes are separated by "Break" split entries."""
+  buffer, current_scene, new_scenes, start = [], [], [], None
+
+  for i, j in zip(new_script, new_timestamp):
+    buffer.append(i.replace("\n", ""))
+    if not start:
+      start = j["start"]
+
+    if "\n" in i:  # a word carrying a line break closes the current subtitle line
+      current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]})
+      buffer, start = [], None
+
+    if "\n\n" in i:  # a double line break also closes the current scene
+      print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"")
+      new_scenes.append(current_scene)
+      current_scene = []
+
+  if start:
+    # Flush words after the last line break; the loop already buffered them, so nothing is re-appended.
+    current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]})
+
+  if current_scene:  # skip the empty scene left behind by a trailing section break
+    new_scenes.append(current_scene)
+
+  newsub = []
+  for n, i in enumerate(new_scenes):
+    newsub += i
+    if n < len(new_scenes) - 1:
+      newsub.append({"content": "Break", "start": None, "end": None, "split": True})
+
+  return newsub
+
+###
+
 def autobreak(lines, times):
   from datetime import timedelta
 
@@ -222,108 +315,39 @@ def autobreak(lines, times):
   
   return script
 
-def scene_from_new_script(raw_script, full_script, full_scenes):
-  mod_script = raw_script.replace("\n", " \n ").split(" ")
-  mod_script = [i for i in mod_script if i]
-  n = 0
-  while True:
-    if mod_script[n] == "\n":
-      mod_script[n-1] += "\n"
-      del(mod_script[n])
-      n -= 1
-    n += 1
-    if n == len(mod_script):
-      break
-  
-  print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
-  allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
-
-  def normalized(x):
-    for i in allowed_list:
-      x = x.replace(i, "")
-    return x.upper()
-  
-  same = lambda a, b: normalized(a) == normalized(b)
-  new_script, new_timestamp, orig_index, n = [], [], 0, 0
-  fail = 0
-  while n < len(mod_script):
-    print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
-    word = mod_script[n]
-    if same(word, full_script[orig_index].replace("##", "")):
-      cur = full_scenes[orig_index]
-      new_script.append(word.replace("##", ""))
-      new_timestamp.append({"start": cur["start"], "end": cur["end"]})
-      fail = 0
-    else:
-      if fail > 10:
-        print("Error: Failed to match words,")
-        return
-      fail += 1
-      n -= 1
-    n, orig_index = n+1, orig_index+1
-  assert len(new_script) == len(new_timestamp)
-  return new_script, new_timestamp
-
-def build_new_subtitle(new_script, new_timestamp):
-  buffer, new_scenes, start, end = [], [], None, None
-  current_scene = []
- 
-  for i, j in zip(new_script, new_timestamp):
-    if "\n" in i:
-      buffer.append(i.replace("\n", ""))
-      current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]})
-      buffer, start = [], None
- 
-      if "\n\n" in i:
-        print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"")
-        new_scenes.append(current_scene)
-        current_scene = []
-
-    else:
-      buffer.append(i)
-      if not start:
-        start = j["start"]
-
-  if start:
-      buffer.append(i.replace("\n", ""))
-      current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]})
-
-  if current_scene != (new_scenes[-1] if new_scenes else None):
-    new_scenes.append(current_scene)
-
-  newsub = []
-  for n, i in enumerate(new_scenes):
-    newsub += i
-    if n < len(new_scenes) - 1:
-      newsub.append({"content": "Break", "start": None, "end": None, "split": True})
-
-  return newsub
-
-###
+############################################
 
 def saveFile(filename, data, override = False):
   if os.path.exists(filename) and not override:
-    print(f"File {filename} already exists.")
-    return -1
+    # Refuse to overwrite an existing file unless the caller passes override.
+    raise Exception(f"File {filename} already exists.")
   with open(filename, "w") as f:
     f.write(data)
 
 def openFile(filename):
+  # Fail with a clear message when the input file is missing.
+  if not os.path.exists(filename):
+    raise Exception(f"File {filename} doesnt exists.")
   with open(filename, "r") as f:
     data = f.read()
   if not data:
-    return -1
+    # An empty file is reported as an error rather than returned.
+    raise Exception("Data empty.")
   return data
 
-###
+############################################
 
 if __name__=="__main__":
+  PROG = sys.argv[0].split("/")[-1]
   if len(sys.argv) not in (3, 4):
-    PROG = sys.argv[0].split("/")[-1]
-    print(f"Usage: {PROG} script [VTT file] \n"               \
- f"       {" "*len(PROG)} apply  [VTT file] [script file] \n" \
- f"       {" "*len(PROG)} create [JSON file]"               \
-          )                              
+    print( \
+f"""Usage: {PROG} [COMMAND] [FILES]...
+
+Commands:
+ - script   <VTT file>                    Generates script file from vtt file.
+ - apply    <VTT file> <script file>      Applies new scripted file to create JSON file.
+ - create   <JSON file>                   Creates new vtt from given JSON.
+ """)                              
     sys.exit()
 
   COMMAND = sys.argv[1]
@@ -331,7 +355,7 @@ if __name__=="__main__":
     print("Error. Command not found.")
     sys.exit()
 
-  print(f"-> {PROG} {COMMAND} {FILE}")
+  print(f"-> {sys.argv}")
   if COMMAND == "script":
     FILE = sys.argv[2]
     if (not os.path.exists(FILE)):
@@ -339,12 +363,12 @@ if __name__=="__main__":
       sys.exit(-1)
 
     modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
-    x = create_word_scenes(openFile(FILE), script_from_word_vtt(openFile(FILE)))
+    x = create_word_scenes(openFile(FILE), "\n".join(script_from_word_vtt(openFile(FILE))))
     if not x:
       sys.exit(-1)
 
     full_script, full_scenes = x
-    genscript = autobreak(full_script,full_scenes)
+    genscript = autobreak(full_script, full_scenes)
     saveFile(modfile, genscript)
     print(f"Saved script file {modfile}.")
   
@@ -358,7 +382,7 @@ if __name__=="__main__":
       print(f"Input file doesnt exists.")
       sys.exit(-1)
 
-    x = create_word_scenes(openFile(FILE1), script_from_word_vtt(openFile(FILE)))
+    x = create_word_scenes(openFile(FILE1), "\n".join(script_from_word_vtt(openFile(FILE1))))
     if not x:
       sys.exit(-1)
     full_script, full_scenes = x
diff --git a/subedit.py b/subedit.py
new file mode 100644
index 0000000..7120b60
--- /dev/null
+++ b/subedit.py
@@ -0,0 +1,79 @@
+import json
+import os, sys
+
+def readFile(file):
+  """Return the text content of `file`; raise Exception if the path does not exist."""
+  if os.path.exists(file):
+    with open(file, "r") as handle:
+      return handle.read()
+  raise Exception(f"File {file} doesn't exists.")
+
+def writeFile(file, data, overwrite = False):
+  """Write `data` to `file` and return what f.write() returns; raise on an existing file (unless overwrite) or empty data."""
+  if not overwrite and os.path.exists(file):
+    raise Exception(f"File {file} already exists.")
+  if len(data) == 0:
+    raise Exception(f"Tried to write empty data.")
+  with open(file, "w") as handle:
+    return handle.write(data)
+
+file = sys.argv[1]
+# A .json input is exported to an editable .edit listing; a .edit input is merged back into "<name>.json.1".
+if ".json" in file:
+  subtitles = json.loads(readFile(file))
+  output = ""
+  index = 0
+  for subtitle in subtitles:
+    if subtitle.get("split", False):
+      output += "\n"
+    else:
+      index += 1
+      # Only the 1-based number and the text go in this part; timestamps are
+      # listed separately after the marker line. A blank line stands for a
+      # split entry.
+      content = subtitle["content"]
+      output += f"{index:03} | {content.strip()}\n"
+
+  output += "############ TIMESTAMPS ############\n\n"
+
+  index = 0
+  for subtitle in subtitles:
+    if not subtitle.get("split", False):
+      index += 1
+      start = subtitle["start"]
+      end = subtitle["end"]
+      output += f"{index:03} | {start} --> {end} \n"
+
+  writeFile(os.path.splitext(file)[0]+".edit", output)
+
+elif ".edit" in file:
+  subtitles = json.loads(readFile(os.path.splitext(file)[0]+".json"))
+  lines = readFile(file)
+  # Number the non-split subtitles from 0 so edited lines can look up their timestamps.
+  idx, sub = 0, {}
+  for subtitle in subtitles:
+    if not subtitle.get("split", False):
+      sub[idx] = subtitle
+      idx += 1
+  # Read "NNN | text" lines up to the marker; a blank line records a break after the previous entry.
+  new_brk, new_sub = [], {}
+  for line in lines.split("\n"):
+    if "############ TIMESTAMPS ############" == line:
+      break
+    if line:
+      idx, content = line.split(" | ", 1)
+      idx = int(idx) - 1
+      if sub[idx]["content"].strip() != content:
+        print(f"{idx} {sub[idx]["content"]} -> {content}")
+      new_sub[idx] = {"content": content, "start": sub[idx]["start"], "end": sub[idx]["end"]}
+    else:
+      new_brk.append(idx)
+
+  output = []
+  for n in sorted(new_sub):
+    subtitle = new_sub[n]
+    output.append(subtitle)
+    if n in new_brk:
+      output.append({"content": "Break", "start": None, "end": None, "split": True})
+
+  writeFile(os.path.splitext(file)[0]+".json.1", json.dumps(output, indent=2))
\ No newline at end of file