fix: subtitle

This commit is contained in:
Gabe
2025-10-10 21:17:51 +08:00
parent 3844d2eb75
commit 001f04a9ee
6 changed files with 133 additions and 23 deletions

View File

@@ -510,6 +510,7 @@ export const apiSubtitle = async ({
apiSetting,
}) => {
const cacheOpts = {
apiSlug: apiSetting.apiSlug,
videoId,
fromLang,
toLang,

View File

@@ -34,6 +34,7 @@ import { parseJsonObj, extractJson } from "../libs/utils";
import { kissLog } from "../libs/log";
import { fetchData } from "../libs/fetch";
import { getMsgHistory } from "./history";
import { parseBilingualVtt } from "../subtitle/vtt";
const keyMap = new Map();
const urlMap = new Map();
@@ -118,8 +119,9 @@ const parseSTRes = (raw) => {
}
try {
const jsonString = extractJson(raw);
const data = JSON.parse(jsonString);
// const jsonString = extractJson(raw);
// const data = JSON.parse(jsonString);
const data = parseBilingualVtt(raw);
if (Array.isArray(data)) {
return data;
}

View File

@@ -354,23 +354,49 @@ Output: {"translations":[{"id":1,"text":"一个<b>React</b>组件","sourceLangua
Fail-safe: On any error, return {"translations":[]}.`;
const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
// const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
Output (valid JSON array, output ONLY this array):
[{
"text": "string", // Full sentence with correct punctuation
"translation": "string", // Translation in ${INPUT_PLACE_TO}
"start": int, // Start time (ms)
"end": int, // End time (ms)
"duration": int // end - start
}]
// Output (valid JSON array, output ONLY this array):
// [{
// "text": "string", // Full sentence with correct punctuation
// "translation": "string", // Translation in ${INPUT_PLACE_TO}
// "start": int, // Start time (ms)
// "end": int, // End time (ms)
// }]
Guidelines:
1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
`;
// Guidelines:
// 1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
// 2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
// 3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
// 4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
// `;
// Default prompt for AI subtitle segmentation + translation.
// It asks the model to turn a JSON array of word-level timestamps into a
// bilingual VTT document: one cue per sentence, timestamps expressed in
// milliseconds (`start --> end`), original text on the first line and the
// translation on the second. ${INPUT_PLACE_TO} is interpolated into the prompt
// as the translation target.
// NOTE(review): the example below is wrapped in a ```vtt fence while the prompt
// asks for "pure VTT content" — the response parser should tolerate fences.
const defaultSubtitlePrompt = `You are an expert AI for subtitle generation. Convert a JSON array of word-level timestamps into a bilingual VTT file.
**Workflow:**
1. Merge \`text\` fields into complete sentences; ignore empty text.
2. Split long sentences into smaller, manageable subtitle cues (one sentence per cue).
3. Translate each cue into ${INPUT_PLACE_TO}.
4. Format as VTT:
- Start with \`WEBVTT\`.
- Each cue: timestamps (\`start --> end\` in milliseconds), original text, translated text.
- Keep non-speech text (e.g., \`[Music]\`) untranslated.
- Separate cues with a blank line.
**Output:** Only the pure VTT content.
**Example:**
\`\`\`vtt
WEBVTT
1000 --> 3500
Hello world!
你好,世界!
4000 --> 6000
Good morning.
早上好。
\`\`\``;
const defaultRequestHook = `async (args, { url, body, headers, userMsg, method } = {}) => {
console.log("request hook args:", args);

View File

@@ -44,7 +44,7 @@ export function useApiList() {
);
const aiEnabledApis = useMemo(
() => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiSlug)),
() => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiType)),
[enabledApis]
);
@@ -124,6 +124,7 @@ export function useApiItem(apiSlug) {
apiSlug: item.apiSlug,
apiName: item.apiName,
apiType: item.apiType,
key: item.key,
};
}
return item;

View File

@@ -111,6 +111,11 @@ class YouTubeCaptionProvider {
kissControls.appendChild(toggleButton);
toggleButton.onclick = () => {
if (this.#isBusy) {
logger.info(`Youtube Provider: It's budy now...`);
return;
}
if (!this.#enabled) {
logger.info(`Youtube Provider: Feature toggled ON.`);
this.#startManager();
@@ -283,9 +288,10 @@ class YouTubeCaptionProvider {
OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) ||
"auto";
if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) {
// TODO: split the events into several requests/responses to fit the API's processing capacity
subtitles = await this.#aiSegment({
videoId,
events,
events: this.#flatEvents(events),
fromLang,
toLang,
segApiSetting,
@@ -408,10 +414,7 @@ class YouTubeCaptionProvider {
lines = this.#processSubtitles({ events, usePause: true });
}
return lines.map((item) => ({
...item,
duration: Math.max(0, item.end - item.start),
}));
return lines;
}
#isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) {
@@ -580,6 +583,39 @@ class YouTubeCaptionProvider {
return sentences;
}
#flatEvents(events = []) {
const segments = [];
let buffer = null;
events.forEach(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => {
segs.forEach(({ utf8 = "", tOffsetMs = 0 }, j) => {
const text = utf8.trim().replace(/\s+/g, " ");
const start = tStartMs + tOffsetMs;
if (buffer) {
if (!buffer.end || buffer.end > start) {
buffer.end = start;
}
segments.push(buffer);
buffer = null;
}
buffer = {
text,
start,
};
if (j === segs.length - 1) {
buffer.end = tStartMs + dDurationMs;
}
});
});
segments.push(buffer);
return segments.filter((item) => item.text);
}
}
export const YouTubeInitializer = (() => {

44
src/subtitle/vtt.js Normal file
View File

@@ -0,0 +1,44 @@
/**
 * Converts a millisecond count written as a decimal string into a number.
 *
 * Surrounding whitespace is ignored and parsing stops at the first character
 * that is not part of a leading integer.
 *
 * @param {string} msString Textual millisecond value, e.g. `" 1500 "`.
 * @returns {number} The parsed integer, or `0` when the string does not start
 *   with an integer.
 */
function millisecondsStringToNumber(msString) {
  const parsed = Number.parseInt(msString.trim(), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}
/**
 * Parses a "bilingual VTT" document into a list of subtitle cues.
 *
 * Expected cue layout: a timestamp line `start --> end` (both in
 * milliseconds), followed by the original text on the first line and the
 * translation on the second line.
 *
 * The parser is line-based so that it tolerates common formatting drift:
 * - a leading BOM and CRLF / CR line endings;
 * - missing or whitespace-only separator lines between cues (every line
 *   containing `-->` starts a new cue);
 * - arbitrary spacing around the `-->` arrow;
 * - Markdown code fences (lines starting with three backticks) around the
 *   document, which are treated as cue terminators rather than cue text.
 *
 * Lines that are not part of a cue (the `WEBVTT` header, cue identifiers) are
 * ignored, as are cues without any text line.
 *
 * @param {string} vttText Raw VTT-like text.
 * @returns {Array<{start: number, end: number, text: string, translation: string}>}
 *   Parsed cues in document order; `translation` is `""` when the cue has only
 *   one text line.
 */
export function parseBilingualVtt(vttText) {
  const lines = vttText
    .replace(/^\uFEFF/, "")
    .replace(/\r\n?/g, "\n")
    .split("\n");

  const result = [];
  // Cue currently being collected: { start, end, texts } or null.
  let current = null;

  // Emits the pending cue (if it has text) and resets the collector.
  const flush = () => {
    if (current && current.texts.length > 0) {
      const [text, translation = ""] = current.texts;
      result.push({
        start: current.start,
        end: current.end,
        text,
        translation,
      });
    }
    current = null;
  };

  for (const rawLine of lines) {
    const line = rawLine.trim();

    if (line.includes("-->")) {
      // A timestamp line always closes the previous cue, even without a
      // blank separator line in between.
      flush();
      const [startTimeString, endTimeString] = line.split(/\s*-->\s*/);
      if (startTimeString && endTimeString) {
        current = {
          start: millisecondsStringToNumber(startTimeString),
          end: millisecondsStringToNumber(endTimeString),
          texts: [],
        };
      }
      continue;
    }

    // Blank lines and code-fence markers end the current cue and are never
    // part of its text.
    if (line === "" || line.startsWith("```")) {
      flush();
      continue;
    }

    if (current) {
      current.texts.push(line);
    }
  }

  flush();
  return result;
}