fixes and library updates

This commit is contained in:
zadam
2020-04-02 22:49:27 +02:00
parent 1a58026b23
commit 338f01be01
8 changed files with 669 additions and 1305 deletions

View File

@@ -1,5 +1,3 @@
// https://github.com/mozilla/readability/tree/814f0a3884350b6f1adfdebb79ca3599e9806605
/*eslint-env es6:false*/
/*
* Copyright (c) 2010 Arc90 Inc
@@ -38,6 +36,7 @@ function Readability(doc, options) {
options = options || {};
this._doc = doc;
this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
@@ -50,11 +49,12 @@ function Readability(doc, options) {
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
this._keepClasses = !!options.keepClasses;
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
var logEl;
@@ -116,8 +116,8 @@ Readability.prototype = {
REGEXPS: {
// NOTE: These two regular expressions are duplicated in
// Readability-readerable.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
@@ -126,6 +126,7 @@ Readability.prototype = {
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/,
@@ -159,13 +160,15 @@ Readability.prototype = {
*
* @param Element
* @return void
**/
**/
_postProcessContent: function(articleContent) {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);
// Remove classes.
this._cleanClasses(articleContent);
if (!this._keepClasses) {
// Remove classes.
this._cleanClasses(articleContent);
}
},
/**
@@ -179,6 +182,10 @@ Readability.prototype = {
* @return void
*/
_removeNodes: function(nodeList, filterFn) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _removeNodes");
}
for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i];
var parentNode = node.parentNode;
@@ -198,6 +205,10 @@ Readability.prototype = {
* @return void
*/
_replaceNodeTags: function(nodeList, newTagName) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _replaceNodeTags");
}
for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i];
this._setNodeTag(node, newTagName);
@@ -285,11 +296,11 @@ Readability.prototype = {
_cleanClasses: function(node) {
var classesToPreserve = this._classesToPreserve;
var className = (node.getAttribute("class") || "")
.split(/\s+/)
.filter(function(cls) {
return classesToPreserve.indexOf(cls) != -1;
})
.join(" ");
.split(/\s+/)
.filter(function(cls) {
return classesToPreserve.indexOf(cls) != -1;
})
.join(" ");
if (className) {
node.setAttribute("class", className);
@@ -330,11 +341,21 @@ Readability.prototype = {
this._forEachNode(links, function(link) {
var href = link.getAttribute("href");
if (href) {
// Replace links with javascript: URIs with text content, since
// Remove links with javascript: URIs, since
// they won't work after scripts have been removed from the page.
if (href.indexOf("javascript:") === 0) {
var text = this._doc.createTextNode(link.textContent);
link.parentNode.replaceChild(text, link);
// if the link only contains simple text content, it can be converted to a text node
if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
var text = this._doc.createTextNode(link.textContent);
link.parentNode.replaceChild(text, link);
} else {
// if the link has multiple children, they should all be preserved
var container = this._doc.createElement("span");
while (link.childNodes.length > 0) {
container.appendChild(link.childNodes[0]);
}
link.parentNode.replaceChild(container, link);
}
} else {
link.setAttribute("href", toAbsoluteURI(href));
}
@@ -386,8 +407,8 @@ Readability.prototype = {
// Check if we have a heading containing this exact string, so we
// could assume it's the full title.
var headings = this._concatNodeLists(
doc.getElementsByTagName("h1"),
doc.getElementsByTagName("h2")
doc.getElementsByTagName("h1"),
doc.getElementsByTagName("h2")
);
var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
@@ -422,7 +443,7 @@ Readability.prototype = {
var curTitleWordCount = wordCount(curTitle);
if (curTitleWordCount <= 4 &&
(!titleHadHierarchicalSeparators ||
curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
curTitle = origTitle;
}
@@ -439,13 +460,13 @@ Readability.prototype = {
var doc = this._doc;
// Remove all style tags in head
this._removeNodes(doc.getElementsByTagName("style"));
this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
if (doc.body) {
this._replaceBrs(doc.body);
}
this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN");
this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
},
/**
@@ -456,8 +477,8 @@ Readability.prototype = {
_nextElement: function (node) {
var next = node;
while (next
&& (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
&& (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
return next;
@@ -525,7 +546,7 @@ Readability.prototype = {
_setNodeTag: function (node, tag) {
this.log("_setNodeTag", node, tag);
if (node.__JSDOMParser__) {
if (this._docJSDOMParser) {
node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase();
return node;
@@ -588,7 +609,7 @@ Readability.prototype = {
this._forEachNode(articleContent.children, function (topCandidate) {
this._cleanMatchedNodes(topCandidate, function (node, matchString) {
return /share/.test(matchString) && node.textContent.length < shareElementThreshold;
return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
});
});
@@ -625,7 +646,7 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs
this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
var imgCount = paragraph.getElementsByTagName("img").length;
var embedCount = paragraph.getElementsByTagName("embed").length;
var objectCount = paragraph.getElementsByTagName("object").length;
@@ -662,7 +683,7 @@ Readability.prototype = {
*
* @param Element
* @return void
**/
**/
_initializeNode: function(node) {
node.readability = {"contentScore": 0};
@@ -769,7 +790,7 @@ Readability.prototype = {
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
**/
_grabArticle: function (page) {
this.log("**** grabArticle ****");
var doc = this._doc;
@@ -823,8 +844,8 @@ Readability.prototype = {
// Remove DIV, SECTION, HEADER, and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
this._isElementWithoutContent(node)) {
node = this._removeAndGetNext(node);
continue;
@@ -880,7 +901,7 @@ Readability.prototype = {
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
**/
var candidates = [];
this._forEachNode(elementsToScore, function(elementToScore) {
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
@@ -1085,7 +1106,7 @@ Readability.prototype = {
if (nodeLength > 80 && linkDensity < 0.25) {
append = true;
} else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 &&
nodeContent.search(/\.( |$)/) !== -1) {
nodeContent.search(/\.( |$)/) !== -1) {
append = true;
}
}
@@ -1264,12 +1285,12 @@ Readability.prototype = {
// get title
metadata.title = values["dc:title"] ||
values["dcterm:title"] ||
values["og:title"] ||
values["weibo:article:title"] ||
values["weibo:webpage:title"] ||
values["title"] ||
values["twitter:title"];
values["dcterm:title"] ||
values["og:title"] ||
values["weibo:article:title"] ||
values["weibo:webpage:title"] ||
values["title"] ||
values["twitter:title"];
if (!metadata.title) {
metadata.title = this._getArticleTitle();
@@ -1277,17 +1298,17 @@ Readability.prototype = {
// get author
metadata.byline = values["dc:creator"] ||
values["dcterm:creator"] ||
values["author"];
values["dcterm:creator"] ||
values["author"];
// get description
metadata.excerpt = values["dc:description"] ||
values["dcterm:description"] ||
values["og:description"] ||
values["weibo:article:description"] ||
values["weibo:webpage:description"] ||
values["description"] ||
values["twitter:description"];
values["dcterm:description"] ||
values["og:description"] ||
values["weibo:article:description"] ||
values["weibo:webpage:description"] ||
values["description"] ||
values["twitter:description"];
// get site name
metadata.siteName = values["og:site_name"];
@@ -1299,14 +1320,14 @@ Readability.prototype = {
* Removes script tags from the document.
*
* @param Element
**/
**/
_removeScripts: function(doc) {
this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) {
scriptNode.nodeValue = "";
scriptNode.removeAttribute("src");
return true;
});
this._removeNodes(doc.getElementsByTagName("noscript"));
this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"]));
},
/**
@@ -1316,7 +1337,7 @@ Readability.prototype = {
*
* @param Element
* @param string tag of child element
**/
**/
_hasSingleTagInsideElement: function(element, tag) {
// There should be exactly 1 element child with given tag
if (element.children.length != 1 || element.children[0].tagName !== tag) {
@@ -1326,15 +1347,15 @@ Readability.prototype = {
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
return node.nodeType === this.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
this.REGEXPS.hasContent.test(node.textContent);
});
},
_isElementWithoutContent: function(node) {
return node.nodeType === this.ELEMENT_NODE &&
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
},
/**
@@ -1345,23 +1366,23 @@ Readability.prototype = {
_hasChildBlockElement: function (element) {
return this._someNode(element.childNodes, function(node) {
return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
this._hasChildBlockElement(node);
this._hasChildBlockElement(node);
});
},
/***
* Determine if a node qualifies as phrasing content.
* https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
**/
**/
_isPhrasingContent: function(node) {
return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
this._everyNode(node.childNodes, this._isPhrasingContent));
((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
this._everyNode(node.childNodes, this._isPhrasingContent));
},
_isWhitespace: function(node) {
return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
(node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
(node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
},
/**
@@ -1371,7 +1392,7 @@ Readability.prototype = {
* @param Element
* @param Boolean normalizeSpaces (default: true)
* @return string
**/
**/
_getInnerText: function(e, normalizeSpaces) {
normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
var textContent = e.textContent.trim();
@@ -1388,7 +1409,7 @@ Readability.prototype = {
* @param Element
* @param string - what to split on. Default is ","
* @return number (integer)
**/
**/
_getCharCount: function(e, s) {
s = s || ",";
return this._getInnerText(e).split(s).length - 1;
@@ -1400,7 +1421,7 @@ Readability.prototype = {
*
* @param Element
* @return void
**/
**/
_cleanStyles: function(e) {
if (!e || e.tagName.toLowerCase() === "svg")
return;
@@ -1428,7 +1449,7 @@ Readability.prototype = {
*
* @param Element
* @return number (float)
**/
**/
_getLinkDensity: function(element) {
var textLength = this._getInnerText(element).length;
if (textLength === 0)
@@ -1450,7 +1471,7 @@ Readability.prototype = {
*
* @param Element
* @return number (Integer)
**/
**/
_getClassWeight: function(e) {
if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))
return 0;
@@ -1489,7 +1510,7 @@ Readability.prototype = {
_clean: function(e, tag) {
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
this._removeNodes(e.getElementsByTagName(tag), function(element) {
this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) {
// Allow youtube and vimeo videos through as people usually want to see those.
if (isEmbed) {
// First, check the elements attributes to see if any of them contain youtube or vimeo
@@ -1670,7 +1691,7 @@ Readability.prototype = {
// without affecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(e.getElementsByTagName(tag), function(node) {
this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) {
// First check if this node IS data table, in which case don't remove it.
var isDataTable = function(t) {
return t._readabilityDataTable;
@@ -1704,10 +1725,7 @@ Readability.prototype = {
var input = node.getElementsByTagName("input").length;
var embedCount = 0;
var embeds = this._concatNodeLists(
node.getElementsByTagName("object"),
node.getElementsByTagName("embed"),
node.getElementsByTagName("iframe"));
var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
for (var i = 0; i < embeds.length; i++) {
// If this embed has attribute that matches video regex, don't delete it.
@@ -1729,13 +1747,13 @@ Readability.prototype = {
var contentLength = this._getInnerText(node).length;
var haveToRemove =
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
return haveToRemove;
}
return false;
@@ -1753,7 +1771,7 @@ Readability.prototype = {
var endOfSearchMarkerNode = this._getNextNode(e, true);
var next = this._getNextNode(e);
while (next && next != endOfSearchMarkerNode) {
if (filter(next, next.className + " " + next.id)) {
if (filter.call(this, next, next.className + " " + next.id)) {
next = this._removeAndGetNext(next);
} else {
next = this._getNextNode(next);
@@ -1766,13 +1784,11 @@ Readability.prototype = {
*
* @param Element
* @return void
**/
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
return this._getClassWeight(header) < 0;
});
}
this._removeNodes(this._getAllNodesWithTag(e, ["h1", "h2"]), function (header) {
return this._getClassWeight(header) < 0;
});
},
_flagIsActive: function(flag) {
@@ -1784,7 +1800,11 @@ Readability.prototype = {
},
_isProbablyVisible: function(node) {
return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
// Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
return (!node.style || node.style.display != "none")
&& !node.hasAttribute("hidden")
//check for "fallback-image" so that wikimedia math images are displayed
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
},
/**