fix: 当 JSON-LD 解析失败时添加回退机制

豆瓣现在对未登录请求返回反爬虫验证页面,导致 JSON-LD 解析返回 undefined。

添加从 OG meta 标签提取基本信息的回退机制,防止代码崩溃。

同时给所有数组字段添加默认值 || []。
This commit is contained in:
YuBai 2026-02-02 11:25:56 +08:00
parent a0eccf7370
commit 297ccd33cf
2 changed files with 111 additions and 29 deletions

@ -35,24 +35,24 @@ export default class DoubanMovieLoadHandler extends DoubanAbstractLoadHandler<Do
"director", "director",
DataValueType.array, DataValueType.array,
extract.director, extract.director,
extract.director.map(SchemaOrg.getPersonName).filter(c => c) (extract.director || []).map(SchemaOrg.getPersonName).filter(c => c)
)); ));
variableMap.set("actor", new DataField( variableMap.set("actor", new DataField(
"actor", "actor",
DataValueType.array, DataValueType.array,
extract.actor, extract.actor,
extract.actor.map(SchemaOrg.getPersonName).filter(c => c) (extract.actor || []).map(SchemaOrg.getPersonName).filter(c => c)
)); ));
variableMap.set("author", new DataField( variableMap.set("author", new DataField(
"author", "author",
DataValueType.array, DataValueType.array,
extract.author, extract.author,
extract.author.map(SchemaOrg.getPersonName).map(name => super.getPersonName(name, context)).filter(c => c) (extract.author || []).map(SchemaOrg.getPersonName).map(name => super.getPersonName(name, context)).filter(c => c)
)); ));
variableMap.set("aliases", new DataField("aliases", DataValueType.array, extract.aliases, variableMap.set("aliases", new DataField("aliases", DataValueType.array, extract.aliases,
extract.aliases.map(a=>a (extract.aliases || []).map(a=>a
.trim() .trim()
// .replace(TITLE_ALIASES_SPECIAL_CHAR_REG_G, '_') // .replace(TITLE_ALIASES_SPECIAL_CHAR_REG_G, '_')
// //replase multiple _ to single _ // //replase multiple _ to single _
@ -98,7 +98,7 @@ export default class DoubanMovieLoadHandler extends DoubanAbstractLoadHandler<Do
} }
parseSubjectFromHtml(html: CheerioAPI, context: HandleContext): DoubanMovieSubject { parseSubjectFromHtml(html: CheerioAPI, context: HandleContext): DoubanMovieSubject {
const movie:DoubanMovieSubject = html('script') let movie: DoubanMovieSubject | undefined = html('script')
.get() .get()
.filter(scd => "application/ld+json" == html(scd).attr("type")) .filter(scd => "application/ld+json" == html(scd).attr("type"))
.map(i => { .map(i => {
@ -108,7 +108,7 @@ export default class DoubanMovieLoadHandler extends DoubanAbstractLoadHandler<Do
const idPattern = /(\d){5,10}/g; const idPattern = /(\d){5,10}/g;
const id = idPattern.exec(obj.url); const id = idPattern.exec(obj.url);
const name = obj.name; const name = obj.name;
const title = super.getTitleNameByMode(name, PersonNameMode.CH_NAME, context)??name; const title = super.getTitleNameByMode(name, PersonNameMode.CH_NAME, context) ?? name;
const originalTitle = super.getTitleNameByMode(name, PersonNameMode.EN_NAME, context) ?? name; const originalTitle = super.getTitleNameByMode(name, PersonNameMode.EN_NAME, context) ?? name;
const result: DoubanMovieSubject = { const result: DoubanMovieSubject = {
@ -119,14 +119,14 @@ export default class DoubanMovieLoadHandler extends DoubanAbstractLoadHandler<Do
originalTitle: originalTitle, originalTitle: originalTitle,
desc: obj.description, desc: obj.description,
url: "https://movie.douban.com" + obj.url, url: "https://movie.douban.com" + obj.url,
director: obj.director, director: obj.director || [],
author: obj.author, author: obj.author || [],
actor: obj.actor, actor: obj.actor || [],
aggregateRating: obj.aggregateRating, aggregateRating: obj.aggregateRating,
datePublished: obj.datePublished ? new Date(obj.datePublished) : undefined, datePublished: obj.datePublished ? new Date(obj.datePublished) : undefined,
image: obj.image, image: obj.image,
imageUrl: obj.image, imageUrl: obj.image,
genre: obj.genre, genre: obj.genre || [],
publisher: '', publisher: '',
aliases: [""], aliases: [""],
language: [""], language: [""],
@ -136,10 +136,52 @@ export default class DoubanMovieLoadHandler extends DoubanAbstractLoadHandler<Do
} }
return result; return result;
})[0]; })[0];
// Fallback: if JSON-LD parsing failed (e.g., anti-bot page), extract from meta tags
if (!movie) {
const title = html(html("head > meta[property='og:title']").get(0)).attr("content") || '';
const image = html(html("head > meta[property='og:image']").get(0)).attr("content") || '';
const urlMeta = html(html("head > meta[property='og:url']").get(0)).attr("content") || '';
const desc = html(html("head > meta[property='og:description']").get(0)).attr("content") || '';
// Extract ID from URL
const idPattern = /(\d){5,10}/g;
const idMatch = idPattern.exec(urlMeta);
const id = idMatch ? idMatch[0] : '';
// Extract score from HTML
const scoreText = html("#interest_sectl strong[property='v:average']").text();
const score = scoreText ? parseFloat(scoreText) : undefined;
movie = {
id,
title,
type: this.getSupportType(),
score,
originalTitle: title,
desc,
url: urlMeta || (id ? `https://movie.douban.com/subject/${id}/` : ''),
director: [],
author: [],
actor: [],
aggregateRating: undefined,
datePublished: undefined,
image,
imageUrl: image,
genre: [],
publisher: '',
aliases: [],
language: [],
country: [],
time: null,
IMDb: null,
};
}
this.handlePersonNameByMeta(html, movie, context, 'video:actor', 'actor'); this.handlePersonNameByMeta(html, movie, context, 'video:actor', 'actor');
this.handlePersonNameByMeta(html, movie, context, 'video:director', 'director'); this.handlePersonNameByMeta(html, movie, context, 'video:director', 'director');
const desc:string = html("span[property='v:summary']").text(); const desc: string = html("span[property='v:summary']").text();
if (desc) { if (desc) {
movie.desc = desc; movie.desc = desc;
} }
@ -156,7 +198,7 @@ export default class DoubanMovieLoadHandler extends DoubanAbstractLoadHandler<Do
// value = html(info.next.next).text().trim(); // value = html(info.next.next).text().trim();
const vas = html(info.next).text().trim(); const vas = html(info.next).text().trim();
value = vas.split("/").map((v) => v.trim()); value = vas.split("/").map((v) => v.trim());
} else if(key.indexOf('片长') >= 0) { } else if (key.indexOf('片长') >= 0) {
value = html(info.next.next).text().trim() value = html(info.next.next).text().trim()
} else { } else {
value = html(info.next).text().trim(); value = html(info.next).text().trim();

@ -25,22 +25,22 @@ export class DoubanTeleplayLoadHandler extends DoubanAbstractLoadHandler<DoubanT
} }
parseVariable(beforeContent: string, variableMap:Map<string, DataField>, extract: DoubanTeleplaySubject, context: HandleContext): void { parseVariable(beforeContent: string, variableMap:Map<string, DataField>, extract: DoubanTeleplaySubject, context: HandleContext): void {
variableMap.set("director", new DataField("director", DataValueType.array, extract.director,extract.director.map(SchemaOrg.getPersonName).filter(c => c))); variableMap.set("director", new DataField("director", DataValueType.array, extract.director,(extract.director || []).map(SchemaOrg.getPersonName).filter(c => c)));
variableMap.set("actor", new DataField( variableMap.set("actor", new DataField(
"actor", "actor",
DataValueType.array, DataValueType.array,
extract.actor, extract.actor,
extract.actor.map(SchemaOrg.getPersonName).filter(c => c) (extract.actor || []).map(SchemaOrg.getPersonName).filter(c => c)
)); ));
variableMap.set("author", new DataField( variableMap.set("author", new DataField(
"author", "author",
DataValueType.array, DataValueType.array,
extract.author, extract.author,
extract.author.map(SchemaOrg.getPersonName).map(name => super.getPersonName(name, context)).filter(c => c) (extract.author || []).map(SchemaOrg.getPersonName).map(name => super.getPersonName(name, context)).filter(c => c)
)); ));
variableMap.set("aliases", new DataField("aliases", DataValueType.array, extract.aliases, variableMap.set("aliases", new DataField("aliases", DataValueType.array, extract.aliases,
extract.aliases.map(a=>a (extract.aliases || []).map(a=>a
.trim() .trim()
// .replace(TITLE_ALIASES_SPECIAL_CHAR_REG_G, '_') // .replace(TITLE_ALIASES_SPECIAL_CHAR_REG_G, '_')
// //replase multiple _ to single _ // //replase multiple _ to single _
@ -84,7 +84,7 @@ export class DoubanTeleplayLoadHandler extends DoubanAbstractLoadHandler<DoubanT
} }
parseSubjectFromHtml(html: CheerioAPI, context: HandleContext): DoubanTeleplaySubject { parseSubjectFromHtml(html: CheerioAPI, context: HandleContext): DoubanTeleplaySubject {
const teleplay:DoubanTeleplaySubject = html('script') let teleplay: DoubanTeleplaySubject | undefined = html('script')
.get() .get()
.filter(scd => "application/ld+json" == html(scd).attr("type")) .filter(scd => "application/ld+json" == html(scd).attr("type"))
.map(i => { .map(i => {
@ -104,14 +104,14 @@ export class DoubanTeleplayLoadHandler extends DoubanAbstractLoadHandler<DoubanT
originalTitle: originalTitle, originalTitle: originalTitle,
desc: obj.description, desc: obj.description,
url: "https://movie.douban.com" + obj.url, url: "https://movie.douban.com" + obj.url,
director: obj.director, director: obj.director || [],
author: obj.author, author: obj.author || [],
actor: obj.actor, actor: obj.actor || [],
aggregateRating: obj.aggregateRating, aggregateRating: obj.aggregateRating,
datePublished: obj.datePublished ? new Date(obj.datePublished) : undefined, datePublished: obj.datePublished ? new Date(obj.datePublished) : undefined,
image: obj.image, image: obj.image,
imageUrl: obj.image, imageUrl: obj.image,
genre: obj.genre, genre: obj.genre || [],
score: obj.aggregateRating ? obj.aggregateRating.ratingValue : undefined, score: obj.aggregateRating ? obj.aggregateRating.ratingValue : undefined,
publisher: "", publisher: "",
aliases: [""], aliases: [""],
@ -124,6 +124,46 @@ export class DoubanTeleplayLoadHandler extends DoubanAbstractLoadHandler<DoubanT
return result; return result;
})[0]; })[0];
// Fallback: if JSON-LD parsing failed, extract from meta tags
if (!teleplay) {
const title = html(html("head > meta[property='og:title']").get(0)).attr("content") || '';
const image = html(html("head > meta[property='og:image']").get(0)).attr("content") || '';
const urlMeta = html(html("head > meta[property='og:url']").get(0)).attr("content") || '';
const desc = html(html("head > meta[property='og:description']").get(0)).attr("content") || '';
const idPattern = /(\d){5,10}/g;
const idMatch = idPattern.exec(urlMeta);
const id = idMatch ? idMatch[0] : '';
const scoreText = html("#interest_sectl strong[property='v:average']").text();
const score = scoreText ? parseFloat(scoreText) : undefined;
teleplay = {
id,
title,
type: this.getSupportType(),
score,
originalTitle: title,
desc,
url: urlMeta || (id ? `https://movie.douban.com/subject/${id}/` : ''),
director: [],
author: [],
actor: [],
aggregateRating: undefined,
datePublished: undefined,
image,
imageUrl: image,
genre: [],
publisher: '',
aliases: [],
language: [],
country: [],
episode: null,
time: null,
IMDb: null,
};
}
this.handlePersonNameByMeta(html, teleplay, context, 'video:actor', 'actor'); this.handlePersonNameByMeta(html, teleplay, context, 'video:actor', 'actor');
this.handlePersonNameByMeta(html, teleplay, context, 'video:director', 'director'); this.handlePersonNameByMeta(html, teleplay, context, 'video:director', 'director');
const desc:string = html("span[property='v:summary']").text(); const desc:string = html("span[property='v:summary']").text();