mirror of
https://github.com/ManchildProductions/UXP-Fixed.git
synced 2026-05-29 14:23:25 +00:00
Update Readability from mozilla-central release branch (FF 60.0).
This commit is contained in:
@@ -560,7 +560,8 @@
|
||||
},
|
||||
};
|
||||
|
||||
var Document = function () {
|
||||
var Document = function (url) {
|
||||
this.documentURI = url;
|
||||
this.styleSheets = [];
|
||||
this.childNodes = [];
|
||||
this.children = [];
|
||||
@@ -600,6 +601,20 @@
|
||||
node.textContent = text;
|
||||
return node;
|
||||
},
|
||||
|
||||
/**
 * Base URI of this document.
 *
 * Computed on first access and cached in the own property `_baseURI`.
 * The value starts as `documentURI`; if the first <base> element has a
 * non-empty `href` attribute, that href is resolved against `documentURI`
 * and the result is used instead. If resolving throws, `documentURI` is kept.
 *
 * @return the cached base URI (equal to `documentURI` when no usable
 *         <base href> is found).
 */
get baseURI() {
|
||||
// Only compute once per instance: the cache lives in an own property.
if (!this.hasOwnProperty("_baseURI")) {
|
||||
this._baseURI = this.documentURI;
|
||||
var baseElements = this.getElementsByTagName("base");
|
||||
// Only the first <base> element is consulted; href is falsy when there is
|
||||
// no such element.
var href = baseElements[0] && baseElements[0].getAttribute("href");
|
||||
if (href) {
|
||||
try {
|
||||
// At this point _baseURI still holds documentURI, so href is resolved
|
||||
// relative to the document's own URI.
this._baseURI = (new URL(href, this._baseURI)).href;
|
||||
} catch (ex) {/* Just fall back to documentURI */}
|
||||
}
|
||||
}
|
||||
return this._baseURI;
|
||||
},
|
||||
};
|
||||
|
||||
var Element = function (tag) {
|
||||
@@ -1118,9 +1133,9 @@
|
||||
/**
|
||||
* Parses an HTML string and returns a JS implementation of the Document.
|
||||
*/
|
||||
parse: function (html) {
|
||||
parse: function (html, url) {
|
||||
this.html = html;
|
||||
var doc = this.doc = new Document();
|
||||
var doc = this.doc = new Document(url);
|
||||
this.readChildren(doc);
|
||||
|
||||
// If this is an HTML document, remove root-level children except for the
|
||||
|
||||
@@ -41,6 +41,7 @@ function Readability(uri, doc, options) {
|
||||
this._articleTitle = null;
|
||||
this._articleByline = null;
|
||||
this._articleDir = null;
|
||||
this._attempts = [];
|
||||
|
||||
// Configurable options
|
||||
this._debug = !!options.debug;
|
||||
@@ -275,34 +276,20 @@ Readability.prototype = {
|
||||
* @return void
|
||||
*/
|
||||
_fixRelativeUris: function(articleContent) {
|
||||
var scheme = this._uri.scheme;
|
||||
var prePath = this._uri.prePath;
|
||||
var pathBase = this._uri.pathBase;
|
||||
|
||||
var baseURI = this._doc.baseURI;
|
||||
var documentURI = this._doc.documentURI;
|
||||
function toAbsoluteURI(uri) {
|
||||
// If this is already an absolute URI, return it.
|
||||
if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
|
||||
// Leave hash links alone if the base URI matches the document URI:
|
||||
if (baseURI == documentURI && uri.charAt(0) == "#") {
|
||||
return uri;
|
||||
|
||||
// Scheme-rooted relative URI.
|
||||
if (uri.substr(0, 2) == "//")
|
||||
return scheme + "://" + uri.substr(2);
|
||||
|
||||
// Prepath-rooted relative URI.
|
||||
if (uri[0] == "/")
|
||||
return prePath + uri;
|
||||
|
||||
// Dotslash relative URI.
|
||||
if (uri.indexOf("./") === 0)
|
||||
return pathBase + uri.slice(2);
|
||||
|
||||
// Ignore hash URIs:
|
||||
if (uri[0] == "#")
|
||||
return uri;
|
||||
|
||||
// Standard relative URI; add entire path. pathBase already includes a
|
||||
// trailing "/".
|
||||
return pathBase + uri;
|
||||
}
|
||||
// Otherwise, resolve against base URI:
|
||||
try {
|
||||
return new URL(uri, baseURI).href;
|
||||
} catch (ex) {
|
||||
// Something went wrong, just return the original:
|
||||
}
|
||||
return uri;
|
||||
}
|
||||
|
||||
var links = articleContent.getElementsByTagName("a");
|
||||
@@ -535,6 +522,7 @@ Readability.prototype = {
|
||||
this._clean(articleContent, "embed");
|
||||
this._clean(articleContent, "h1");
|
||||
this._clean(articleContent, "footer");
|
||||
this._clean(articleContent, "link");
|
||||
|
||||
// Clean out elements that have "share" in their id/class combinations from final top candidates,
|
||||
// which means we don't remove the top candidates even if they have "share".
|
||||
@@ -1089,24 +1077,45 @@ Readability.prototype = {
|
||||
if (this._debug)
|
||||
this.log("Article content after paging: " + articleContent.innerHTML);
|
||||
|
||||
var parseSuccessful = true;
|
||||
|
||||
// Now that we've gone through the full algorithm, check to see if
|
||||
// we got any meaningful content. If we didn't, we may need to re-run
|
||||
// grabArticle with different flags set. This gives us a higher likelihood of
|
||||
// finding the content, and the sieve approach gives us a higher likelihood of
|
||||
// finding the -right- content.
|
||||
if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
|
||||
var textLength = this._getInnerText(articleContent, true).length;
|
||||
if (textLength < this._wordThreshold) {
|
||||
parseSuccessful = false;
|
||||
page.innerHTML = pageCacheHtml;
|
||||
|
||||
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
|
||||
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
|
||||
this._attempts.push({articleContent: articleContent, textLength: textLength});
|
||||
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
|
||||
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
|
||||
this._attempts.push({articleContent: articleContent, textLength: textLength});
|
||||
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
|
||||
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
|
||||
this._attempts.push({articleContent: articleContent, textLength: textLength});
|
||||
} else {
|
||||
return null;
|
||||
this._attempts.push({articleContent: articleContent, textLength: textLength});
|
||||
// No luck after removing flags, just return the longest text we found during the different loops
|
||||
this._attempts.sort(function (a, b) {
|
||||
return a.textLength < b.textLength;
|
||||
});
|
||||
|
||||
// But first check if we actually have something
|
||||
if (!this._attempts[0].textLength) {
|
||||
return null;
|
||||
}
|
||||
|
||||
articleContent = this._attempts[0].articleContent;
|
||||
parseSuccessful = true;
|
||||
}
|
||||
} else {
|
||||
}
|
||||
|
||||
if (parseSuccessful) {
|
||||
// Find out text direction from ancestors of final top candidate.
|
||||
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
|
||||
this._someNode(ancestors, function(ancestor) {
|
||||
|
||||
@@ -47,7 +47,7 @@ var Agent = {
|
||||
* @return {object} Article object returned from Readability.
|
||||
*/
|
||||
parseDocument(uri, serializedDoc, options) {
|
||||
let doc = new JSDOMParser().parse(serializedDoc);
|
||||
let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
|
||||
return new Readability(uri, doc, options).parse();
|
||||
},
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user