feat: support subtitle chunks for AI

This commit is contained in:
Gabe
2025-10-11 21:06:38 +08:00
parent d9b4399c57
commit 1afe976777
6 changed files with 304 additions and 84 deletions

View File

@@ -517,6 +517,7 @@ export const apiTranslate = async ({
// 字幕处理/翻译
export const apiSubtitle = async ({
videoId,
chunkSign,
fromLang = "auto",
toLang,
events = [],
@@ -525,6 +526,7 @@ export const apiSubtitle = async ({
const cacheOpts = {
apiSlug: apiSetting.apiSlug,
videoId,
chunkSign,
fromLang,
toLang,
};

View File

@@ -1554,9 +1554,9 @@ export const I18N = {
zh_TW: `啟用字幕翻譯`,
},
is_bilingual_view: {
zh: `启用双语显示`,
en: `DEnable bilingual display`,
zh_TW: `啟用雙語顯示`,
zh: `双语显示`,
en: `Enable bilingual display`,
zh_TW: `雙語顯示`,
},
background_styles: {
zh: `背景样式`,
@@ -1578,6 +1578,11 @@ export const I18N = {
en: `AI intelligent punctuation`,
zh_TW: `AI智慧斷句`,
},
ai_chunk_length: {
zh: `AI处理切割长度`,
en: `AI processing chunk length`,
zh_TW: `AI处理切割长度`,
},
subtitle_helper_1: {
zh: `1、目前仅支持Youtube且仅支持浏览器扩展。`,
en: `1. Currently only supports Youtube and browser extensions.`,

View File

@@ -112,6 +112,7 @@ export const DEFAULT_SUBTITLE_SETTING = {
enabled: true, // 是否开启
apiSlug: OPT_TRANS_MICROSOFT,
segSlug: "-", // AI智能断句
chunkLength: 1000, // AI处理切割长度
// fromLang: "en",
toLang: "zh-CN",
isBilingual: true, // 是否双语显示

View File

@@ -230,4 +230,23 @@ export class BilingualSubtitleManager {
}
}
}
/**
* 追加新的字幕
* @param {Array<object>} newSubtitlesChunk - 新的、要追加的字幕数据块。
*/
appendSubtitles(newSubtitlesChunk) {
if (!newSubtitlesChunk || newSubtitlesChunk.length === 0) {
return;
}
logger.info(
`Bilingual Subtitle Manager: Appending ${newSubtitlesChunk.length} new subtitles...`
);
this.#formattedSubtitles.push(...newSubtitlesChunk);
this.#formattedSubtitles.sort((a, b) => a.start - b.start);
this.#currentSubtitleIndex = -1;
this.onTimeUpdate();
}
}

View File

@@ -218,10 +218,14 @@ class YouTubeCaptionProvider {
return docUrl.searchParams.get("v");
}
async #aiSegment({ videoId, toLang, events, segApiSetting }) {
async #aiSegment({ videoId, fromLang, toLang, chunkEvents, segApiSetting }) {
try {
const events = chunkEvents.filter((item) => item.text);
const chunkSign = `${events[0].start} --> ${events[events.length - 1].end}`;
const subtitles = await apiSubtitle({
videoId,
chunkSign,
fromLang,
toLang,
events,
apiSetting: segApiSetting,
@@ -279,7 +283,8 @@ class YouTubeCaptionProvider {
return;
}
let subtitles = [];
const flatEvents = this.#flatEvents(events);
if (!flatEvents.length) return;
const { segApiSetting, toLang } = this.#setting;
const lang = potUrl.searchParams.get("lang");
@@ -287,26 +292,77 @@ class YouTubeCaptionProvider {
OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang) ||
OPT_LANGS_TO_CODE[OPT_TRANS_MICROSOFT].get(lang.slice(0, 2)) ||
"auto";
if (potUrl.searchParams.get("kind") === "asr" && segApiSetting) {
// todo: 切分多次发送接受以适应接口处理能力
subtitles = await this.#aiSegment({
logger.info("Youtube Provider: Starting AI ...");
const eventChunks = this.#splitEventsIntoChunks(
flatEvents,
segApiSetting.chunkLength
);
const subtitlesFallback = () =>
this.#formatSubtitles(flatEvents, fromLang);
if (eventChunks.length === 0) {
this.#onCaptionsReady({
videoId,
subtitles: subtitlesFallback(),
fromLang,
isInitialLoad: true,
});
return;
}
const firstChunkEvents = eventChunks[0];
const firstBatchSubtitles = await this.#aiSegment({
videoId,
events: this.#flatEvents(events),
chunkEvents: firstChunkEvents,
fromLang,
toLang,
segApiSetting,
});
}
if (!subtitles?.length) {
subtitles = this.#formatSubtitles(events, fromLang);
}
if (!subtitles?.length) {
logger.info("Youtube Provider: No subtitles after format.");
return;
}
if (!firstBatchSubtitles?.length) {
this.#onCaptionsReady({
videoId,
subtitles: subtitlesFallback(),
fromLang,
isInitialLoad: true,
});
return;
}
this.#onCaptionsReady({ videoId, subtitles, fromLang });
this.#onCaptionsReady({
videoId,
subtitles: firstBatchSubtitles,
fromLang,
isInitialLoad: true,
});
if (eventChunks.length > 1) {
const remainingChunks = eventChunks.slice(1);
this.#processRemainingChunksAsync({
chunks: remainingChunks,
videoId,
fromLang,
toLang,
segApiSetting,
});
}
} else {
const subtitles = this.#formatSubtitles(flatEvents, fromLang);
if (!subtitles?.length) {
logger.info("Youtube Provider: No subtitles after format.");
return;
}
this.#onCaptionsReady({
videoId,
subtitles,
fromLang,
isInitialLoad: true,
});
}
} catch (error) {
logger.warn("Youtube Provider: unknow error", error);
} finally {
@@ -382,8 +438,8 @@ class YouTubeCaptionProvider {
}
}
#formatSubtitles(events, lang) {
if (!events?.length) return [];
#formatSubtitles(flatEvents, lang) {
if (!flatEvents?.length) return [];
const noSpaceLanguages = [
"zh", // 中文
@@ -396,25 +452,49 @@ class YouTubeCaptionProvider {
];
if (noSpaceLanguages.some((l) => lang?.startsWith(l))) {
return events
.map(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => ({
text: segs
.map(({ utf8 = "" }) => utf8)
.join("")
?.trim(),
start: tStartMs,
end: tStartMs + dDurationMs,
}))
.filter((item) => item.text);
const subtitles = [];
let currentLine = null;
const MAX_LENGTH = 100;
for (const segment of flatEvents) {
if (segment.text) {
if (!currentLine) {
currentLine = {
text: segment.text,
start: segment.start,
end: segment.end,
};
} else {
currentLine.text += segment.text;
currentLine.end = segment.end;
}
if (currentLine.text.length >= MAX_LENGTH) {
subtitles.push(currentLine);
currentLine = null;
}
} else {
if (currentLine) {
subtitles.push(currentLine);
currentLine = null;
}
}
}
if (currentLine) {
subtitles.push(currentLine);
}
return subtitles;
}
let lines = this.#processSubtitles({ events });
const isPoor = this.#isQualityPoor(lines);
let subtitles = this.#processSubtitles({ flatEvents });
const isPoor = this.#isQualityPoor(subtitles);
if (isPoor) {
lines = this.#processSubtitles({ events, usePause: true });
subtitles = this.#processSubtitles({ flatEvents, usePause: true });
}
return lines;
return subtitles;
}
#isQualityPoor(lines, lengthThreshold = 250, percentageThreshold = 0.1) {
@@ -426,9 +506,9 @@ class YouTubeCaptionProvider {
}
#processSubtitles({
events,
flatEvents,
usePause = false,
timeout = 1500,
timeout = 1000,
maxWords = 15,
} = {}) {
const groupedPauseWords = {
@@ -516,67 +596,54 @@ class YouTubeCaptionProvider {
let currentBuffer = [];
let bufferWordCount = 0;
const joinSegs = (segs) => ({
text: segs
.map((s) => s.text)
.join(" ")
.trim(),
start: segs[0].start,
end: segs[segs.length - 1].end,
});
const flushBuffer = () => {
if (currentBuffer.length > 0) {
sentences.push(joinSegs(currentBuffer));
sentences.push({
text: currentBuffer
.map((s) => s.text)
.join(" ")
.trim(),
start: currentBuffer[0].start,
end: currentBuffer[currentBuffer.length - 1].end,
});
}
currentBuffer = [];
bufferWordCount = 0;
};
events.forEach(({ segs = [], tStartMs = 0, dDurationMs = 0 }) => {
segs.forEach(({ utf8 = "", tOffsetMs = 0 }, j) => {
const text = utf8?.trim().replace(/\s+/g, " ") || "";
if (!text) return;
flatEvents.forEach((segment) => {
if (!segment.text) return;
const start = tStartMs + tOffsetMs;
const lastSegment = currentBuffer[currentBuffer.length - 1];
const lastSegment = currentBuffer[currentBuffer.length - 1];
if (lastSegment) {
if (!lastSegment.end || lastSegment.end > start) {
lastSegment.end = start;
}
if (lastSegment) {
const isEndOfSentence = /[.?!…\])]$/.test(lastSegment.text);
const isPauseOfSentence = /[,]$/.test(lastSegment.text);
const isTimeout = segment.start - lastSegment.end > timeout;
const isWordLimitExceeded =
(usePause || isPauseOfSentence) && bufferWordCount >= maxWords;
const isEndOfSentence = /[.?!…\])]$/.test(lastSegment.text);
const isPauseOfSentence = /[,]$/.test(lastSegment.text);
const isTimeout = start - lastSegment.end > timeout;
const isWordLimitExceeded =
(usePause || isPauseOfSentence) && bufferWordCount >= maxWords;
const startsWithSign = /^[[(♪]/.test(segment.text);
const startsWithPauseWord =
usePause &&
groupedPauseWords["1"].has(
segment.text.toLowerCase().split(" ")[0]
) &&
currentBuffer.length > 1;
const startsWithSign = /^[[(♪]/.test(text);
const startsWithPauseWord =
usePause &&
groupedPauseWords["1"].has(text.toLowerCase().split(" ")[0]) && // todo: 考虑连词开头
currentBuffer.length > 1;
if (
isEndOfSentence ||
isTimeout ||
isWordLimitExceeded ||
startsWithSign ||
startsWithPauseWord
) {
flushBuffer();
}
if (
isEndOfSentence ||
isTimeout ||
isWordLimitExceeded ||
startsWithSign ||
startsWithPauseWord
) {
flushBuffer();
}
}
const currentSegment = { text, start };
if (j === segs.length - 1) {
currentSegment.end = tStartMs + dDurationMs;
}
currentBuffer.push(currentSegment);
bufferWordCount += text.split(/\s+/).length;
});
currentBuffer.push(segment);
bufferWordCount += segment.text.split(/\s+/).length;
});
flushBuffer();
@@ -614,7 +681,114 @@ class YouTubeCaptionProvider {
segments.push(buffer);
return segments.filter((item) => item.text);
return segments;
}
#splitEventsIntoChunks(flatEvents, chunkLength = 1000) {
if (!flatEvents || flatEvents.length === 0) {
return [];
}
const eventChunks = [];
let currentChunk = [];
let currentChunkTextLength = 0;
const MAX_CHUNK_LENGTH = chunkLength + 500;
const PAUSE_THRESHOLD_MS = 1000;
for (let i = 0; i < flatEvents.length; i++) {
const event = flatEvents[i];
currentChunk.push(event);
currentChunkTextLength += event.text.length;
const isLastEvent = i === flatEvents.length - 1;
if (isLastEvent) {
continue;
}
let shouldSplit = false;
if (currentChunkTextLength >= MAX_CHUNK_LENGTH) {
shouldSplit = true;
} else if (currentChunkTextLength >= chunkLength) {
const isEndOfSentence = /[.?!…\])]$/.test(event.text);
const nextEvent = flatEvents[i + 1];
const pauseDuration = nextEvent.start - event.end;
if (isEndOfSentence || pauseDuration > PAUSE_THRESHOLD_MS) {
shouldSplit = true;
}
}
if (shouldSplit) {
eventChunks.push(currentChunk);
currentChunk = [];
currentChunkTextLength = 0;
}
}
if (currentChunk.length > 0) {
eventChunks.push(currentChunk);
}
return eventChunks;
}
async #processRemainingChunksAsync({
chunks,
videoId,
fromLang,
toLang,
segApiSetting,
}) {
logger.info(`Youtube Provider: Starting for ${chunks.length} chunks.`);
for (let i = 0; i < chunks.length; i++) {
const chunkEvents = chunks[i];
const chunkNum = i + 2;
logger.info(
`Youtube Provider: Processing subtitle chunk ${chunkNum}/${chunks.length + 1}...`
);
let subtitlesForThisChunk = [];
try {
const aiSubtitles = await this.#aiSegment({
videoId,
chunkEvents,
fromLang,
toLang,
segApiSetting,
});
if (aiSubtitles?.length > 0) {
subtitlesForThisChunk = aiSubtitles;
} else {
logger.info(
`Youtube Provider: AI segmentation for chunk ${chunkNum} returned no data.`
);
subtitlesForThisChunk = this.#formatSubtitles(chunkEvents, fromLang);
}
} catch (chunkError) {
subtitlesForThisChunk = this.#formatSubtitles(chunkEvents, fromLang);
}
if (this.#videoId !== videoId) {
logger.info("Youtube Provider: videoId changed!");
break;
}
if (subtitlesForThisChunk.length > 0 && this.#managerInstance) {
logger.info(
`Youtube Provider: Appending ${subtitlesForThisChunk.length} subtitles from chunk ${chunkNum}.`
);
this.#managerInstance.appendSubtitles(subtitlesForThisChunk);
} else {
logger.info(`Youtube Provider: Chunk ${chunkNum} no subtitles.`);
}
await sleep(randomBetween(500, 1000));
}
logger.info("Youtube Provider: All subtitle chunks processed.");
}
}

View File

@@ -10,6 +10,7 @@ import Alert from "@mui/material/Alert";
import Switch from "@mui/material/Switch";
import { useSubtitle } from "../../hooks/Subtitle";
import { useApiList } from "../../hooks/Api";
import { limitNumber } from "../../libs/utils";
export default function SubtitleSetting() {
const i18n = useI18n();
@@ -19,6 +20,12 @@ export default function SubtitleSetting() {
const handleChange = (e) => {
e.preventDefault();
let { name, value } = e.target;
switch (name) {
case "chunkLength":
value = limitNumber(value, 200, 20000);
break;
default:
}
updateSubtitle({
[name]: value,
});
@@ -28,6 +35,7 @@ export default function SubtitleSetting() {
enabled,
apiSlug,
segSlug,
chunkLength,
toLang,
isBilingual,
windowStyle,
@@ -96,6 +104,17 @@ export default function SubtitleSetting() {
))}
</TextField>
</Grid>
<Grid item xs={12} sm={12} md={6} lg={3}>
<TextField
fullWidth
size="small"
label={i18n("ai_chunk_length")}
type="number"
name="chunkLength"
value={chunkLength}
onChange={handleChange}
/>
</Grid>
<Grid item xs={12} sm={12} md={6} lg={3}>
<TextField
fullWidth