fix: subtitle

2025-10-10 21:17:51 +08:00
parent 3844d2eb75
commit 001f04a9ee
6 changed files with 133 additions and 23 deletions
--- a/src/apis/index.js
+++ b/src/apis/index.js
@@ -510,6 +510,7 @@ export const apiSubtitle = async ({
  apiSetting,
 }) => {
  const cacheOpts = {
+    apiSlug: apiSetting.apiSlug,
    videoId,
    fromLang,
    toLang,
--- a/src/apis/trans.js
+++ b/src/apis/trans.js
@@ -34,6 +34,7 @@ import { parseJsonObj, extractJson } from "../libs/utils";
 import { kissLog } from "../libs/log";
 import { fetchData } from "../libs/fetch";
 import { getMsgHistory } from "./history";
+import { parseBilingualVtt } from "../subtitle/vtt";

 const keyMap = new Map();
 const urlMap = new Map();
@@ -118,8 +119,9 @@ const parseSTRes = (raw) => {
  }

  try {
-    const jsonString = extractJson(raw);
-    const data = JSON.parse(jsonString);
+    // const jsonString = extractJson(raw);
+    // const data = JSON.parse(jsonString);
+    const data = parseBilingualVtt(raw);
    if (Array.isArray(data)) {
      return data;
    }
--- a/src/config/api.js
+++ b/src/config/api.js
@@ -354,23 +354,49 @@ Output: {"translations":[{"id":1,"text":"一个<b>React</b>组件","sourceLangua

 Fail-safe: On any error, return {"translations":[]}.`;

-const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
+// const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.

-Output (valid JSON array, output ONLY this array):
-[{
-  "text": "string",        // Full sentence with correct punctuation
-  "translation": "string", // Translation in ${INPUT_PLACE_TO}
-  "start": int,            // Start time (ms)
-  "end": int,              // End time (ms)
-  "duration": int          // end - start
-}]
+// Output (valid JSON array, output ONLY this array):
+// [{
+//   "text": "string",        // Full sentence with correct punctuation
+//   "translation": "string", // Translation in ${INPUT_PLACE_TO}
+//   "start": int,            // Start time (ms)
+//   "end": int,              // End time (ms)
+// }]

-Guidelines:
-1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
-2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
-3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
-4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
-`;
+// Guidelines:
+// 1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
+// 2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
+// 3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
+// 4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
+// `;
+
+/**
+ * Default prompt for AI subtitle processing.
+ *
+ * Asks the model to merge word-level timestamped text into sentence cues,
+ * translate each cue, and answer with VTT-style text whose cue timings are
+ * plain milliseconds (`start --> end`), followed by the original line and then
+ * the translated line.
+ *
+ * `${INPUT_PLACE_TO}` is interpolated where the target language is named.
+ * NOTE(review): presumably a placeholder token substituted at request time —
+ * confirm against its definition.
+ */
+const defaultSubtitlePrompt = `You are an expert AI for subtitle generation. Convert a JSON array of word-level timestamps into a bilingual VTT file.
+
+**Workflow:**
+1. Merge \`text\` fields into complete sentences; ignore empty text.
+2. Split long sentences into smaller, manageable subtitle cues (one sentence per cue).
+3. Translate each cue into ${INPUT_PLACE_TO}.
+4. Format as VTT:
+   - Start with \`WEBVTT\`.
+   - Each cue: timestamps (\`start --> end\` in milliseconds), original text, translated text.
+   - Keep non-speech text (e.g., \`[Music]\`) untranslated.
+   - Separate cues with a blank line.
+
+**Output:** Only the pure VTT content.
+
+**Example:**
+\`\`\`vtt
+WEBVTT
+
+1000 --> 3500
+Hello world!
+你好，世界！
+
+4000 --> 6000
+Good morning.
+早上好。
+\`\`\``;

 const defaultRequestHook = `async (args, { url, body, headers, userMsg, method } = {}) => {
  console.log("request hook args:", args);
--- a/src/hooks/Api.js
+++ b/src/hooks/Api.js
@@ -44,7 +44,7 @@ export function useApiList() {
  );

  const aiEnabledApis = useMemo(
-    () => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiSlug)),
+    () => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiType)),
    [enabledApis]
  );

@@ -124,6 +124,7 @@ export function useApiItem(apiSlug) {
            apiSlug: item.apiSlug,
            apiName: item.apiName,
            apiType: item.apiType,
+            key: item.key,
          };
        }
        return item;
--- a/src/subtitle/YouTubeCaptionProvider.js
+++ b/src/subtitle/YouTubeCaptionProvider.js
@@ -111,6 +111,11 @@ class YouTubeCaptionProvider {
    kissControls.appendChild(toggleButton);

    toggleButton.onclick = () => {
+      if (this.#isBusy) {
+        logger.info(`Youtube Provider: It's budy now...`);
+        return;
+      }
+
      if (!this.#enabled) {
        logger.info(`Youtube Provider: Feature toggled ON.`);
        this.#startManager();
@@ -283,9 +288,10 @@ class YouTubeCaptionProvider {
        OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) ||
        "auto";
      if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) {
+        // TODO: split the events into multiple request/response batches to fit the API's processing capacity
        subtitles = await this.#aiSegment({
          videoId,
-          events,
+          events: this.#flatEvents(events),
          fromLang,
          toLang,
          segApiSetting,
@@ -408,10 +414,7 @@ class YouTubeCaptionProvider {
      lines = this.#processSubtitles({ events, usePause: true });
    }

-    return lines.map((item) => ({
-      ...item,
-      duration: Math.max(0, item.end - item.start),
-    }));
+    return lines;
  }

  #isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) {
@@ -580,6 +583,39 @@ class YouTubeCaptionProvider {

    return sentences;
  }
+
+  /**
+   * Flattens raw caption events into a flat list of timed text segments.
+   *
+   * Every `seg` of every event becomes `{ text, start, end }`, where `start`
+   * is `tStartMs + tOffsetMs` and `text` is the trimmed, whitespace-collapsed
+   * `utf8` value. A segment ends where the next segment starts; the last
+   * segment of an event is first given the event's own end
+   * (`tStartMs + dDurationMs`) and is only shortened when the next segment
+   * starts earlier than that.
+   *
+   * @param {Array<object>} [events] - Events carrying `segs`, `tStartMs` and
+   *   `dDurationMs`; missing fields default to `[]` / `0`.
+   * @returns {Array<{text: string, start: number, end: number}>} Segments with
+   *   non-empty text; an empty array when there is nothing to flatten.
+   */
+  #flatEvents(events = []) {
+    const segments = [];
+    let buffer = null;
+
+    for (const { segs = [], tStartMs = 0, dDurationMs = 0 } of events ?? []) {
+      segs.forEach(({ utf8 = "", tOffsetMs = 0 }, j) => {
+        const text = utf8.trim().replace(/\s+/g, " ");
+        const start = tStartMs + tOffsetMs;
+
+        if (buffer) {
+          // Close the pending segment: it may not run past the start of the
+          // segment that follows it.
+          if (!buffer.end || buffer.end > start) {
+            buffer.end = start;
+          }
+          segments.push(buffer);
+        }
+
+        buffer = { text, start };
+
+        // The last segment of an event ends when the event itself ends.
+        if (j === segs.length - 1) {
+          buffer.end = tStartMs + dDurationMs;
+        }
+      });
+    }
+
+    // With no segments at all `buffer` is still null; pushing it would make
+    // the filter below throw on `null.text`.
+    if (buffer) {
+      segments.push(buffer);
+    }
+
+    return segments.filter((item) => item.text);
+  }
 }

 export const YouTubeInitializer = (() => {
--- a/src/subtitle/vtt.js
+++ b/src/subtitle/vtt.js
@@ -0,0 +1,44 @@
+/**
+ * Converts a millisecond timestamp string to an integer.
+ *
+ * @param {string} msString - Timestamp text such as "1000"; surrounding
+ *   whitespace is ignored.
+ * @returns {number} The parsed milliseconds, or 0 when the text does not start
+ *   with a number.
+ */
+function millisecondsStringToNumber(msString) {
+  const milliseconds = Number.parseInt(msString.trim(), 10);
+
+  return Number.isNaN(milliseconds) ? 0 : milliseconds;
+}
+
+/**
+ * Parses bilingual VTT-like text into subtitle entries.
+ *
+ * Cues are separated by blank lines. Inside a cue, the first line containing
+ * `-->` holds the start and end times in plain milliseconds, the next line is
+ * the original text and the line after that (optional) is the translation.
+ * Blocks without a timing line (for example the `WEBVTT` header) are skipped.
+ *
+ * Tolerated input variations: a leading BOM, CRLF/CR line endings,
+ * whitespace-only separator lines, missing spaces around `-->`, and Markdown
+ * code-fence lines wrapped around the content.
+ *
+ * @param {string} vttText - Raw VTT-like text.
+ * @returns {Array<{start: number, end: number, text: string, translation: string}>}
+ *   Parsed cues in input order; empty when nothing could be parsed or the
+ *   input is not a string.
+ */
+export function parseBilingualVtt(vttText) {
+  if (typeof vttText !== "string") {
+    return [];
+  }
+
+  // Normalize line endings first so cue splitting works for CRLF/CR input.
+  const cleanText = vttText
+    .replace(/^\uFEFF/, "")
+    .replace(/\r\n?/g, "\n")
+    .trim();
+  // A cue boundary is one or more empty (or whitespace-only) lines.
+  const cues = cleanText.split(/\n(?:[ \t]*\n)+/);
+  // Matches a line that is only a Markdown code fence, e.g. "```" or "```vtt".
+  const codeFence = /^```[\w-]*$/;
+
+  const result = [];
+
+  for (const cue of cues) {
+    // Drop fence lines so they never end up as subtitle text.
+    const lines = cue.split("\n").filter((line) => !codeFence.test(line.trim()));
+
+    const timestampLineIndex = lines.findIndex((line) => line.includes("-->"));
+    if (timestampLineIndex === -1) continue;
+
+    // Split on the arrow with optional surrounding whitespace so that
+    // "1000-->3500" is accepted as well as "1000 --> 3500".
+    const [startTimeString, endTimeString] =
+      lines[timestampLineIndex].split(/\s*-->\s*/);
+    const textLines = lines.slice(timestampLineIndex + 1);
+
+    if (!startTimeString || !endTimeString || textLines.length === 0) continue;
+
+    result.push({
+      start: millisecondsStringToNumber(startTimeString),
+      end: millisecondsStringToNumber(endTimeString),
+      text: textLines[0].trim(),
+      translation: (textLines[1] || "").trim(),
+    });
+  }
+
+  return result;
+}