feat: subtitle: support ai segmentation

This commit is contained in:
Gabe
2025-10-09 23:55:06 +08:00
parent 2d0ea09e06
commit 9bafc937d5
12 changed files with 294 additions and 50 deletions

View File

@@ -13,10 +13,15 @@ import {
MSG_BUILTINAI_DETECT,
MSG_BUILTINAI_TRANSLATE,
OPT_TRANS_BUILTINAI,
URL_CACHE_SUBTITLE,
} from "../config";
import { sha256, withTimeout } from "../libs/utils";
import { kissLog } from "../libs/log";
import { handleTranslate, handleMicrosoftLangdetect } from "./trans";
import {
handleTranslate,
handleSubtitle,
handleMicrosoftLangdetect,
} from "./trans";
import { getHttpCachePolyfill, putHttpCachePolyfill } from "../libs/cache";
import { getBatchQueue } from "../libs/batchQueue";
import { isBuiltinAIAvailable } from "../libs/browser";
@@ -495,3 +500,36 @@ export const apiTranslate = async ({
return [trText, isSame];
};
/**
 * Subtitle processing / translation with caching.
 *
 * Builds a cache key from the video id and the language pair, returns the
 * cached value when one exists, and otherwise delegates to `handleSubtitle`.
 * Only non-empty results are written back to the cache.
 *
 * @param {object} param0
 * @param {string} param0.videoId - Video identifier (part of the cache key).
 * @param {string} [param0.fromLang="en"] - Source language.
 * @param {string} param0.toLang - Target language.
 * @param {Array} [param0.events=[]] - Raw subtitle events, forwarded as-is.
 * @param {object} param0.apiSetting - API setting, forwarded as-is.
 * @returns {Promise<*>} The cached value, the fresh subtitles, or `[]`.
 */
export const apiSubtitle = async ({
  videoId,
  fromLang = "en",
  toLang,
  events = [],
  apiSetting,
}) => {
  const cacheInput = `${URL_CACHE_SUBTITLE}?${queryString.stringify({
    videoId,
    fromLang,
    toLang,
  })}`;

  const cached = await getHttpCachePolyfill(cacheInput);
  if (cached) {
    return cached;
  }

  const subtitles = await handleSubtitle({
    events,
    from: fromLang,
    to: toLang,
    apiSetting,
  });
  if (!subtitles?.length) {
    // Empty results are not cached.
    return [];
  }

  // The cache write is not awaited; the result is returned right away.
  putHttpCachePolyfill(cacheInput, null, subtitles);
  return subtitles;
};

View File

@@ -90,25 +90,44 @@ const genUserPrompt = ({
};
/**
 * Parse the raw text returned by an AI translation API.
 *
 * The text is passed through `extractJson`, parsed as JSON, and its
 * `translations` array is mapped to `[text, sourceLanguage]` pairs.
 * Any failure (empty input, no JSON, invalid JSON, missing `translations`)
 * yields an empty array; parse errors are logged, never thrown.
 *
 * @param {string} raw - Raw response text.
 * @returns {Array<[string, string]>} One `[text, sourceLanguage]` pair per item.
 */
const parseAIRes = (raw) => {
  if (!raw) {
    return [];
  }

  try {
    const jsonString = extractJson(raw);
    const data = JSON.parse(jsonString);
    // `data` can be null (e.g. JSON.parse(null) when nothing was extracted),
    // so use optional chaining instead of letting a TypeError reach the log.
    if (Array.isArray(data?.translations)) {
      // TODO: the items' sequence ids may come back out of order.
      return data.translations.map((item) => [
        item?.text ?? "",
        item?.sourceLanguage ?? "",
      ]);
    }
  } catch (err) {
    kissLog("parseAIRes", err);
  }

  return [];
};
/**
 * Parse the raw text returned by an AI subtitle request.
 *
 * The text is passed through `extractJson` and parsed as JSON; the parsed
 * value is returned only when it is an array. Everything else (empty input,
 * non-array JSON, parse failure) yields an empty array. Parse errors are
 * logged, never thrown.
 *
 * @param {string} raw - Raw response text.
 * @returns {Array} The parsed array, or `[]`.
 */
const parseSTRes = (raw) => {
  let subtitles = [];

  if (raw) {
    try {
      const parsed = JSON.parse(extractJson(raw));
      if (Array.isArray(parsed)) {
        subtitles = parsed;
      }
    } catch (err) {
      kissLog("parseAIRes: subtitle", err);
    }
  }

  return subtitles;
};
const genGoogle = ({ texts, from, to, url, key }) => {
@@ -258,7 +277,7 @@ const genOpenAI = ({
model,
temperature,
maxTokens,
hisMsgs,
hisMsgs = [],
}) => {
const userMsg = {
role: "user",
@@ -295,7 +314,7 @@ const genGemini = ({
model,
temperature,
maxTokens,
hisMsgs,
hisMsgs = [],
}) => {
url = url
.replaceAll(INPUT_PLACE_MODEL, model)
@@ -359,7 +378,7 @@ const genGemini2 = ({
model,
temperature,
maxTokens,
hisMsgs,
hisMsgs = [],
}) => {
const userMsg = {
role: "user",
@@ -395,7 +414,7 @@ const genClaude = ({
model,
temperature,
maxTokens,
hisMsgs,
hisMsgs = [],
}) => {
const userMsg = {
role: "user",
@@ -427,7 +446,7 @@ const genOpenRouter = ({
model,
temperature,
maxTokens,
hisMsgs,
hisMsgs = [],
}) => {
const userMsg = {
role: "user",
@@ -464,7 +483,7 @@ const genOllama = ({
model,
temperature,
maxTokens,
hisMsgs,
hisMsgs = [],
}) => {
const userMsg = {
role: "user",
@@ -579,7 +598,7 @@ const genInit = ({
* @param {*}
* @returns
*/
export const genTransReq = async ({ reqHook, resHook, ...args }) => {
export const genTransReq = async ({ reqHook, ...args }) => {
const {
apiType,
apiSlug,
@@ -593,6 +612,7 @@ export const genTransReq = async ({ reqHook, resHook, ...args }) => {
glossary,
customHeader,
customBody,
events,
} = args;
if (API_SPE_TYPES.mulkeys.has(apiType)) {
@@ -605,7 +625,9 @@ export const genTransReq = async ({ reqHook, resHook, ...args }) => {
if (API_SPE_TYPES.ai.has(apiType)) {
args.systemPrompt = genSystemPrompt({ systemPrompt, from, to });
args.userPrompt = genUserPrompt({
args.userPrompt = !!events
? JSON.stringify(events)
: genUserPrompt({
userPrompt,
from,
to,
@@ -632,7 +654,7 @@ export const genTransReq = async ({ reqHook, resHook, ...args }) => {
}
// 执行 request hook
if (reqHook?.trim()) {
if (reqHook?.trim() && !events) {
try {
interpreter.run(`exports.reqHook = ${reqHook}`);
const hookResult = await interpreter.exports.reqHook(args, {
@@ -864,7 +886,8 @@ export const handleTranslate = async (
httpTimeout,
});
if (!res) {
throw new Error("tranlate got empty response");
kissLog("tranlate got empty response");
return [];
}
return parseTransRes(res, {
@@ -908,3 +931,54 @@ export const handleMicrosoftLangdetect = async (texts = []) => {
return [];
};
/**
 * Subtitle translation.
 *
 * Builds a request through `genTransReq` (passing the subtitle events and
 * using `apiSetting.subtitlePrompt` as the system prompt), sends it with
 * `fetchData` (cache disabled, pool enabled), and extracts the subtitle
 * list from the response according to the API type.
 *
 * @param {object} param0
 * @param {Array} param0.events - Raw subtitle events.
 * @param {string} [param0.from="en"] - Source language.
 * @param {string} param0.to - Target language.
 * @param {object} param0.apiSetting - API setting; must be an object.
 * @returns {Promise<*>} Parsed subtitles; the raw response for the customize
 *   API type; `[]` for an empty response or an unhandled API type.
 */
export const handleSubtitle = async ({
  events,
  from = "en",
  to,
  apiSetting,
}) => {
  const { apiType, fetchInterval, fetchLimit, httpTimeout } = apiSetting;

  const reqArgs = {
    ...apiSetting,
    events,
    from,
    to,
    // The subtitle prompt replaces the regular translation system prompt.
    systemPrompt: apiSetting.subtitlePrompt,
  };
  const [input, init] = await genTransReq(reqArgs);

  const fetchOpts = {
    useCache: false,
    usePool: true,
    fetchInterval,
    fetchLimit,
    httpTimeout,
  };
  const res = await fetchData(input, init, fetchOpts);
  if (!res) {
    kissLog("subtitle got empty response");
    return [];
  }

  // These API types share the `choices[0].message.content` response shape.
  if (
    apiType === OPT_TRANS_OPENAI ||
    apiType === OPT_TRANS_GEMINI_2 ||
    apiType === OPT_TRANS_OPENROUTER ||
    apiType === OPT_TRANS_OLLAMA
  ) {
    return parseSTRes(res?.choices?.[0]?.message?.content ?? "");
  }
  if (apiType === OPT_TRANS_GEMINI) {
    return parseSTRes(res?.candidates?.[0]?.content?.parts?.[0]?.text ?? "");
  }
  if (apiType === OPT_TRANS_CLAUDE) {
    return parseSTRes(res?.content?.[0]?.text ?? "");
  }
  if (apiType === OPT_TRANS_CUSTOMIZE) {
    // The customize type returns the response untouched.
    return res;
  }

  return [];
};

View File

@@ -354,6 +354,24 @@ Output: {"translations":[{"id":1,"text":"一个<b>React</b>组件","sourceLangua
Fail-safe: On any error, return {"translations":[]}.`;
// Default prompt for AI subtitle segmentation; it is the initial value of
// `subtitlePrompt` in `defaultApi`.
// It asks the model to merge raw subtitle events into full sentences and to
// answer with a bare JSON array of { text, translation, start, end, duration }.
// INPUT_PLACE_TO marks where the target language goes.
// NOTE(review): presumably substituted by the prompt generator before the
// request is sent — confirm.
const defaultSubtitlePrompt = `Goal: Convert raw subtitle event JSON into a clean, sentence-based JSON array.
Output (valid JSON array, output ONLY this array):
[{
"text": "string", // Full sentence with correct punctuation
"translation": "string", // Translation in ${INPUT_PLACE_TO}
"start": int, // Start time (ms)
"end": int, // End time (ms)
"duration": int // end - start
}]
Guidelines:
1. **Segmentation**: Merge sequential 'utf8' strings from 'segs' into full sentences, merging groups logically.
2. **Punctuation**: Ensure proper sentence-final punctuation (., ?, !); add if missing.
3. **Translation**: Translate 'text' into ${INPUT_PLACE_TO}, place result in 'translation'.
4. **Special Cases**: '[Music]' (and similar cues) are standalone entries. Translate appropriately (e.g., '[音乐]', '[Musique]').
`;
const defaultRequestHook = `async (args, { url, body, headers, userMsg, method } = {}) => {
console.log("request hook args:", args);
// return { url, body, headers, userMsg, method };
@@ -375,6 +393,7 @@ const defaultApi = {
key: "",
model: "", // 模型名称
systemPrompt: defaultSystemPrompt,
subtitlePrompt: defaultSubtitlePrompt,
userPrompt: "",
tone: BUILTIN_STONES[0], // 翻译风格
placeholder: BUILTIN_PLACEHOLDERS[0], // 占位符

View File

@@ -1484,9 +1484,9 @@ export const I18N = {
zh_TW: `佔位標名`,
},
system_prompt_helper: {
zh: `在未完全理解默认Prompt的情况下请勿随意修改否则可能翻译失败`,
en: `If you do not fully understand the default prompt, please do not modify it at will, otherwise the translation may fail.`,
zh_TW: `在未完全理解預設Prompt的情況下請勿隨意修改否則可能翻譯失敗`,
zh: `在未完全理解默认Prompt的情况下请勿随意修改否则可能无法工作`,
en: `Do not modify the default prompt without fully understanding it, otherwise it may not work.`,
zh_TW: `在未完全理解預設Prompt的情況下請勿隨意修改否則可能無法運作`,
},
if_pre_init: {
zh: `是否预初始化`,
@@ -1568,4 +1568,19 @@ export const I18N = {
en: `Translation style`,
zh_TW: `譯文樣式`,
},
ai_segmentation: {
zh: `AI智能断句`,
en: `AI intelligent punctuation`,
zh_TW: `AI智慧斷句`,
},
subtitle_helper_1: {
zh: `1、目前仅支持Youtube英文字幕双语翻译且仅支持浏览器扩展。`,
en: `1. Currently only supports bilingual translation of Youtube English subtitles, and only supports browser extensions.`,
zh_TW: `1.目前僅支援Youtube英文字幕雙語翻譯且僅支援瀏覽器擴充功能。`,
},
subtitle_helper_2: {
zh: `2、插件内置基础断句逻辑如不理想可以启用AI智能断句但需考虑视频长度与AI接口的处理能力可能等待的时间会很长甚至失败。`,
en: `2. The plug-in has built-in basic segmentation logic. If it is not ideal, you can enable AI intelligent segmentation. However, you need to consider the video length and the processing power of the AI interface. The waiting time may be very long or even fail.`,
zh_TW: `2.插件內建基礎斷句邏輯如不理想可以啟用AI智能斷句但需考慮視訊長度與AI介面的處理能力可能等待的時間會很長甚至失敗。`,
},
};

View File

@@ -111,6 +111,7 @@ const SUBTITLE_TRANSLATION_STYLE = `font-size: clamp(1.5rem, 3cqw, 3rem);`;
export const DEFAULT_SUBTITLE_SETTING = {
enabled: true, // 是否开启
apiSlug: OPT_TRANS_MICROSOFT,
segSlug: "-", // AI智能断句
// fromLang: "en",
toLang: "zh-CN",
isBilingual: true, // 是否双语显示

View File

@@ -1,6 +1,7 @@
import { APP_LCNAME } from "./app";
// Cache URL constants, all built on the `https://${APP_LCNAME}/` origin.
// URL_CACHE_SUBTITLE is the base that `apiSubtitle` appends its query string
// to when building a cache key.
// NOTE(review): presumably the other constants play the same role for their
// features — confirm against their callers.
export const URL_CACHE_TRAN = `https://${APP_LCNAME}/translate`;
export const URL_CACHE_SUBTITLE = `https://${APP_LCNAME}/subtitle`;
export const URL_CACHE_DELANG = `https://${APP_LCNAME}/detectlang`;
export const URL_CACHE_BINGDICT = `https://${APP_LCNAME}/bingdict`;

View File

@@ -43,6 +43,11 @@ export function useApiList() {
[transApis]
);
// Enabled APIs that are AI-backed; these are the choices offered for AI
// segmentation in the subtitle settings.
// `API_SPE_TYPES.ai` is queried with an API *type* elsewhere (see
// `genTransReq`), so the filter must test `api.apiType`. `api.apiSlug` is the
// identifier used to look an entry up (see `runSubtitle`).
// NOTE(review): assumes a user-added entry's slug can differ from its type,
// in which case filtering on the slug would hide it — confirm.
const aiEnabledApis = useMemo(
  () => enabledApis.filter((api) => API_SPE_TYPES.ai.has(api.apiType)),
  [enabledApis]
);
const addApi = useCallback(
(apiType) => {
const defaultApiOpt =
@@ -76,7 +81,15 @@ export function useApiList() {
[updateSetting]
);
return { transApis, userApis, builtinApis, enabledApis, addApi, deleteApi };
return {
transApis,
userApis,
builtinApis,
enabledApis,
aiEnabledApis,
addApi,
deleteApi,
};
}
export function useApiItem(apiSlug) {

View File

@@ -298,11 +298,9 @@ export const parseJsonObj = (str) => {
* @returns
*/
export const extractJson = (raw) => {
  // A non-string input (null, undefined, object, ...) has nothing to extract;
  // returning null keeps callers' `JSON.parse(extractJson(x))` from throwing
  // on `.match`.
  if (typeof raw !== "string") return null;

  // Take the widest `{...}` or `[...]` span, whichever opens first. The `s`
  // flag lets `.` cross newlines, so multi-line and code-fenced JSON are
  // both handled.
  const jsonRegex = /(\{.*\}|\[.*\])/s;
  const match = raw.match(jsonRegex);
  return match ? match[0] : null;
};
/**

View File

@@ -1,10 +1,11 @@
import { logger } from "../libs/log.js";
import { apiTranslate } from "../apis/index.js";
import { apiSubtitle, apiTranslate } from "../apis/index.js";
import { BilingualSubtitleManager } from "./BilingualSubtitleManager.js";
import { MSG_XHR_DATA_YOUTUBE, APP_NAME } from "../config";
import { truncateWords, sleep } from "../libs/utils.js";
import { createLogoSvg } from "../libs/svg.js";
import { randomBetween } from "../libs/utils.js";
import { fetchData } from "../libs/fetch.js";
const VIDEO_SELECT = "#container video";
const CONTORLS_SELECT = ".ytp-right-controls";
@@ -114,7 +115,12 @@ class YouTubeCaptionProvider {
this.#ytControls.before(kissControls);
}
// todo: 优化逻辑
#findCaptionTrack(captionTracks) {
if (!captionTracks.length) {
return null;
}
let captionTrack = captionTracks.find((item) =>
item.vssId?.startsWith(".en")
);
@@ -123,6 +129,10 @@ class YouTubeCaptionProvider {
item.vssId?.startsWith("a.en")
);
}
captionTrack = captionTracks[0];
captionTrack.baseUrl += "&tlang=en";
return captionTrack;
}
@@ -144,7 +154,8 @@ class YouTubeCaptionProvider {
async #getSubtitleEvents(captionTrack, potUrl, responseText) {
if (potUrl.searchParams.get("lang") === captionTrack.languageCode) {
try {
return JSON.parse(responseText);
const json = JSON.parse(responseText);
return json;
} catch (err) {
logger.error("Youtube Provider: parse responseText", err);
return null;
@@ -161,7 +172,7 @@ class YouTubeCaptionProvider {
potUrl.searchParams.delete("kind");
}
const res = await fetch(potUrl);
const res = await fetchData(potUrl, null, { useCache: true });
if (res.ok) {
const json = await res.json();
return json;
@@ -181,6 +192,24 @@ class YouTubeCaptionProvider {
return docUrl.searchParams.get("v");
}
async #aiSegment({ videoId, toLang, events, segApiSetting }) {
try {
const subtitles = await apiSubtitle({
videoId,
toLang,
events,
apiSetting: segApiSetting,
});
if (Array.isArray(subtitles)) {
return subtitles;
}
} catch (err) {
logger.info("Youtube Provider: ai segmentation", err);
}
return [];
}
async #handleInterceptedRequest(url, responseText) {
try {
if (!responseText) {
@@ -216,12 +245,27 @@ class YouTubeCaptionProvider {
potUrl,
responseText
);
if (!subtitleEvents) {
const events = subtitleEvents?.events;
if (!Array.isArray(events)) {
logger.info("Youtube Provider: SubtitleEvents not got.");
return;
}
const subtitles = this.#formatSubtitles(subtitleEvents);
let subtitles = [];
const { segApiSetting, toLang } = this.#setting;
if (captionTrack.kind === "asr" && segApiSetting) {
// todo: 提示用户等待中
subtitles = await this.#aiSegment({
videoId,
events,
toLang,
segApiSetting,
});
}
if (subtitles.length === 0) {
subtitles = this.#formatSubtitles(events);
}
if (subtitles.length === 0) {
logger.info("Youtube Provider: No subtitles after format.");
return;
@@ -300,8 +344,7 @@ class YouTubeCaptionProvider {
}
}
#formatSubtitles(data) {
const events = data?.events;
#formatSubtitles(events) {
if (!Array.isArray(events)) return [];
const lines = [];
@@ -362,7 +405,7 @@ class YouTubeCaptionProvider {
const isPoor = this.#isQualityPoor(lines);
if (isPoor) {
return this.#processSubtitles(data);
return this.#processSubtitles(events);
}
return lines.map((item) => ({
@@ -380,7 +423,7 @@ class YouTubeCaptionProvider {
return longLinesCount / lines.length > percentageThreshold;
}
#processSubtitles(data, { timeout = 1500, maxWords = 15 } = {}) {
#processSubtitles(events, { timeout = 1500, maxWords = 15 } = {}) {
const groupedPauseWords = {
1: new Set([
"actually",
@@ -483,7 +526,7 @@ class YouTubeCaptionProvider {
bufferWordCount = 0;
};
data.events?.forEach((event) => {
events?.forEach((event) => {
event.segs?.forEach((seg, j) => {
const text = seg.utf8?.trim() || "";
if (!text) return;

View File

@@ -10,7 +10,7 @@ const providers = [
{ pattern: "https://www.youtube.com/watch", start: YouTubeInitializer },
];
export function runSubtitle({ href, setting, rule }) {
export function runSubtitle({ href, setting }) {
try {
const subtitleSetting = setting.subtitleSetting || DEFAULT_SUBTITLE_SETTING;
if (!subtitleSetting.enabled) {
@@ -24,11 +24,16 @@ export function runSubtitle({ href, setting, rule }) {
injectExternalJs(src, id);
const apiSetting =
setting.transApis.find((api) => api.apiSlug === rule.apiSlug) ||
DEFAULT_API_SETTING;
setting.transApis.find(
(api) => api.apiSlug === subtitleSetting.apiSlug
) || DEFAULT_API_SETTING;
const segApiSetting = setting.transApis.find(
(api) => api.apiSlug === subtitleSetting.segSlug
);
provider.start({
...subtitleSetting,
apiSetting,
segApiSetting,
});
}
} catch (err) {

View File

@@ -210,6 +210,7 @@ function ApiFields({ apiSlug, isUserApi, deleteApi }) {
model = "",
apiType,
systemPrompt = "",
subtitlePrompt = "",
// userPrompt = "",
customHeader = "",
customBody = "",
@@ -344,6 +345,16 @@ function ApiFields({ apiSlug, isUserApi, deleteApi }) {
maxRows={10}
helperText={i18n("system_prompt_helper")}
/>
<TextField
size="small"
label={"SUBTITLE PROMPT"}
name="subtitlePrompt"
value={subtitlePrompt}
onChange={handleChange}
multiline
maxRows={10}
helperText={i18n("system_prompt_helper")}
/>
{/* <TextField
size="small"
label={"USER PROMPT"}

View File

@@ -6,6 +6,7 @@ import Grid from "@mui/material/Grid";
import { useI18n } from "../../hooks/I18n";
import { OPT_LANGS_TO } from "../../config";
import FormControlLabel from "@mui/material/FormControlLabel";
import Alert from "@mui/material/Alert";
import Switch from "@mui/material/Switch";
import { useSubtitle } from "../../hooks/Subtitle";
import { useApiList } from "../../hooks/Api";
@@ -13,7 +14,7 @@ import { useApiList } from "../../hooks/Api";
export default function SubtitleSetting() {
const i18n = useI18n();
const { subtitleSetting, updateSubtitle } = useSubtitle();
const { enabledApis } = useApiList();
const { enabledApis, aiEnabledApis } = useApiList();
const handleChange = (e) => {
e.preventDefault();
@@ -26,6 +27,7 @@ export default function SubtitleSetting() {
const {
enabled,
apiSlug,
segSlug,
toLang,
isBilingual,
windowStyle,
@@ -36,6 +38,12 @@ export default function SubtitleSetting() {
return (
<Box>
<Stack spacing={3}>
<Alert severity="info">
{i18n("subtitle_helper_1")}
<br />
{i18n("subtitle_helper_2")}
</Alert>
<FormControlLabel
control={
<Switch
@@ -69,6 +77,24 @@ export default function SubtitleSetting() {
))}
</TextField>
</Grid>
<Grid item xs={12} sm={12} md={6} lg={3}>
<TextField
select
fullWidth
size="small"
name="segSlug"
value={segSlug}
label={i18n("ai_segmentation")}
onChange={handleChange}
>
<MenuItem value={"-"}>{i18n("disable")}</MenuItem>
{aiEnabledApis.map((api) => (
<MenuItem key={api.apiSlug} value={api.apiSlug}>
{api.apiName}
</MenuItem>
))}
</TextField>
</Grid>
<Grid item xs={12} sm={12} md={6} lg={3}>
<TextField
fullWidth