fix: subtitle
This commit is contained in:
@@ -510,6 +510,7 @@ export const apiSubtitle = async ({
|
||||
apiSetting,
|
||||
}) => {
|
||||
const cacheOpts = {
|
||||
apiSlug: apiSetting.apiSlug,
|
||||
videoId,
|
||||
fromLang,
|
||||
toLang,
|
||||
|
||||
@@ -34,6 +34,7 @@ import { parseJsonObj, extractJson } from "../libs/utils";
|
||||
import { kissLog } from "../libs/log";
|
||||
import { fetchData } from "../libs/fetch";
|
||||
import { getMsgHistory } from "./history";
|
||||
import { parseBilingualVtt } from "../subtitle/vtt";
|
||||
|
||||
const keyMap = new Map();
|
||||
const urlMap = new Map();
|
||||
@@ -118,8 +119,9 @@ const parseSTRes = (raw) => {
|
||||
}
|
||||
|
||||
try {
|
||||
const jsonString = extractJson(raw);
|
||||
const data = JSON.parse(jsonString);
|
||||
// const jsonString = extractJson(raw);
|
||||
// const data = JSON.parse(jsonString);
|
||||
const data = parseBilingualVtt(raw);
|
||||
if (Array.isArray(data)) {
|
||||
return data;
|
||||
}
|
||||
|
||||
@@ -354,23 +354,49 @@ Output: {"translations":[{"id":1,"text":"一个<b>React</b>组件","sourceLangua
|
||||
|
||||
Fail-safe: On any error, return {"translations":[]}.`;
|
||||
|
||||
const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
|
||||
// const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
|
||||
|
||||
Output (valid JSON array, output ONLY this array):
|
||||
[{
|
||||
"text": "string", // Full sentence with correct punctuation
|
||||
"translation": "string", // Translation in ${INPUT_PLACE_TO}
|
||||
"start": int, // Start time (ms)
|
||||
"end": int, // End time (ms)
|
||||
"duration": int // end - start
|
||||
}]
|
||||
// Output (valid JSON array, output ONLY this array):
|
||||
// [{
|
||||
// "text": "string", // Full sentence with correct punctuation
|
||||
// "translation": "string", // Translation in ${INPUT_PLACE_TO}
|
||||
// "start": int, // Start time (ms)
|
||||
// "end": int, // End time (ms)
|
||||
// }]
|
||||
|
||||
Guidelines:
|
||||
1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
|
||||
2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
|
||||
3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
|
||||
4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
|
||||
`;
|
||||
// Guidelines:
|
||||
// 1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
|
||||
// 2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
|
||||
// 3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
|
||||
// 4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
|
||||
// `;
|
||||
|
||||
const defaultSubtitlePrompt = `You are an expert AI for subtitle generation. Convert a JSON array of word-level timestamps into a bilingual VTT file.
|
||||
|
||||
**Workflow:**
|
||||
1. Merge \`text\` fields into complete sentences; ignore empty text.
|
||||
2. Split long sentences into smaller, manageable subtitle cues (one sentence per cue).
|
||||
3. Translate each cue into ${INPUT_PLACE_TO}.
|
||||
4. Format as VTT:
|
||||
- Start with \`WEBVTT\`.
|
||||
- Each cue: timestamps (\`start --> end\` in milliseconds), original text, translated text.
|
||||
- Keep non-speech text (e.g., \`[Music]\`) untranslated.
|
||||
- Separate cues with a blank line.
|
||||
|
||||
**Output:** Only the pure VTT content.
|
||||
|
||||
**Example:**
|
||||
\`\`\`vtt
|
||||
WEBVTT
|
||||
|
||||
1000 --> 3500
|
||||
Hello world!
|
||||
你好,世界!
|
||||
|
||||
4000 --> 6000
|
||||
Good morning.
|
||||
早上好。
|
||||
\`\`\``;
|
||||
|
||||
const defaultRequestHook = `async (args, { url, body, headers, userMsg, method } = {}) => {
|
||||
console.log("request hook args:", args);
|
||||
|
||||
@@ -44,7 +44,7 @@ export function useApiList() {
|
||||
);
|
||||
|
||||
const aiEnabledApis = useMemo(
|
||||
() => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiSlug)),
|
||||
() => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiType)),
|
||||
[enabledApis]
|
||||
);
|
||||
|
||||
@@ -124,6 +124,7 @@ export function useApiItem(apiSlug) {
|
||||
apiSlug: item.apiSlug,
|
||||
apiName: item.apiName,
|
||||
apiType: item.apiType,
|
||||
key: item.key,
|
||||
};
|
||||
}
|
||||
return item;
|
||||
|
||||
@@ -111,6 +111,11 @@ class YouTubeCaptionProvider {
|
||||
kissControls.appendChild(toggleButton);
|
||||
|
||||
toggleButton.onclick = () => {
|
||||
if (this.#isBusy) {
|
||||
logger.info(`Youtube Provider: It's budy now...`);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!this.#enabled) {
|
||||
logger.info(`Youtube Provider: Feature toggled ON.`);
|
||||
this.#startManager();
|
||||
@@ -283,9 +288,10 @@ class YouTubeCaptionProvider {
|
||||
OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) ||
|
||||
"auto";
|
||||
if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) {
|
||||
// todo: 切分多次发送接受以适应接口处理能力
|
||||
subtitles = await this.#aiSegment({
|
||||
videoId,
|
||||
events,
|
||||
events: this.#flatEvents(events),
|
||||
fromLang,
|
||||
toLang,
|
||||
segApiSetting,
|
||||
@@ -408,10 +414,7 @@ class YouTubeCaptionProvider {
|
||||
lines = this.#processSubtitles({ events, usePause: true });
|
||||
}
|
||||
|
||||
return lines.map((item) => ({
|
||||
...item,
|
||||
duration: Math.max(0, item.end - item.start),
|
||||
}));
|
||||
return lines;
|
||||
}
|
||||
|
||||
#isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) {
|
||||
@@ -580,6 +583,39 @@ class YouTubeCaptionProvider {
|
||||
|
||||
return sentences;
|
||||
}
|
||||
|
||||
#flatEvents(events = []) {
|
||||
const segments = [];
|
||||
let buffer = null;
|
||||
|
||||
events.forEach(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => {
|
||||
segs.forEach(({ utf8 = "", tOffsetMs = 0 }, j) => {
|
||||
const text = utf8.trim().replace(/\s+/g, " ");
|
||||
const start = tStartMs + tOffsetMs;
|
||||
|
||||
if (buffer) {
|
||||
if (!buffer.end || buffer.end > start) {
|
||||
buffer.end = start;
|
||||
}
|
||||
segments.push(buffer);
|
||||
buffer = null;
|
||||
}
|
||||
|
||||
buffer = {
|
||||
text,
|
||||
start,
|
||||
};
|
||||
|
||||
if (j === segs.length - 1) {
|
||||
buffer.end = tStartMs + dDurationMs;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
segments.push(buffer);
|
||||
|
||||
return segments.filter((item) => item.text);
|
||||
}
|
||||
}
|
||||
|
||||
export const YouTubeInitializer = (() => {
|
||||
|
||||
44
src/subtitle/vtt.js
Normal file
44
src/subtitle/vtt.js
Normal file
@@ -0,0 +1,44 @@
|
||||
/**
 * Convert a string holding an integer number of milliseconds to a number.
 *
 * Surrounding whitespace is ignored and parsing is base 10; anything after
 * the leading integer is discarded by the integer parser.
 *
 * @param {string} msString - Text such as `"1000"` or `" 3500 "`.
 * @returns {number} The parsed integer, or `0` when no integer can be read.
 */
function millisecondsStringToNumber(msString) {
  const parsed = Number.parseInt(msString.trim(), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}
|
||||
|
||||
/**
 * Parse bilingual VTT-style text into subtitle entries.
 *
 * Expected cue layout (cues separated by blank lines):
 *
 *     <start> --> <end>        timestamps as integer milliseconds
 *     original text
 *     translated text          (optional)
 *
 * Chunks without a `-->` line (for example the `WEBVTT` header) are skipped,
 * as are cues with a missing timestamp or no text. Lines before the timestamp
 * line (cue identifiers) are ignored. Markdown code-fence lines are dropped so
 * that a reply wrapped in a fenced block does not leak the fence into the
 * text or translation.
 *
 * @param {string} vttText - Raw VTT-like text.
 * @returns {Array<{start: number, end: number, text: string, translation: string}>}
 *   One entry per valid cue, in input order.
 */
export function parseBilingualVtt(vttText) {
  // Strip a BOM and normalise CRLF/CR to LF so every input splits the same way.
  const cleanText = vttText
    .replace(/^\uFEFF/, "")
    .replace(/\r\n?/g, "\n")
    .trim();

  // A cue separator is one or more blank lines; whitespace-only lines count
  // as blank.
  const cues = cleanText.split(/\n\s*\n/);

  const result = [];

  for (const cue of cues) {
    const lines = cue.split("\n");

    const timestampLineIndex = lines.findIndex((line) => line.includes("-->"));
    if (timestampLineIndex === -1) continue;

    // Tolerate any (or no) spacing around the arrow, matching the detection
    // above which only looks for "-->".
    const [startTimeString, endTimeString] =
      lines[timestampLineIndex].split(/\s*-->\s*/);

    const textLines = lines
      .slice(timestampLineIndex + 1)
      .map((line) => line.trim())
      .filter((line) => line && !line.startsWith("```"));

    if (!startTimeString || !endTimeString || textLines.length === 0) continue;

    result.push({
      start: millisecondsStringToNumber(startTimeString),
      end: millisecondsStringToNumber(endTimeString),
      text: textLines[0],
      translation: textLines[1] || "",
    });
  }

  return result;
}
|
||||
Reference in New Issue
Block a user