Source code for linkcheck.checker

# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Main functions for link checking.
"""

import os
import html
import urllib.parse

from .. import url as urlutil, log, LOG_CHECK

MAX_FILESIZE = 1024 * 1024 * 10  # 10MB


def guess_url(url):
    """Guess if URL is a http or ftp URL.

    @param url: the URL to check
    @type url: unicode
    @return: url with *http://* or *ftp://* prepended if it's detected as
        an http or ftp URL, respectively
    @rtype: unicode
    """
    if url.lower().startswith("www."):
        # syntactic sugar
        return f"http://{url}"
    elif url.lower().startswith("ftp."):
        # syntactic sugar
        return f"ftp://{url}"
    return url
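
# Illustrative usage (an editorial sketch, not part of the original module):
# guess_url() only prepends a scheme for the common "www." / "ftp."
# shorthand and leaves everything else untouched.
#
#   >>> guess_url("www.example.com")
#   'http://www.example.com'
#   >>> guess_url("ftp.example.com")
#   'ftp://ftp.example.com'
#   >>> guess_url("example.com")
#   'example.com'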
def absolute_url(base_url, base_ref, parent_url):
    """
    Search for the absolute url to detect the link type. This does not
    join any url fragments together!

    @param base_url: base url from a link tag
    @type base_url: string or None
    @param base_ref: base url from <base> tag
    @type base_ref: string or None
    @param parent_url: url of parent document
    @type parent_url: string or None
    """
    if base_url and urlutil.url_is_absolute(base_url):
        return base_url
    elif base_ref and urlutil.url_is_absolute(base_ref):
        return base_ref
    elif parent_url and urlutil.url_is_absolute(parent_url):
        return parent_url
    return ""
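
# Illustrative usage (an editorial sketch, not from the original source):
# the first argument that is already absolute wins; relative URLs are
# never joined together.
#
#   >>> absolute_url("http://example.com/page", None, None)
#   'http://example.com/page'
#   >>> absolute_url("page.html", "http://example.com/", None)
#   'http://example.com/'
#   >>> absolute_url("page.html", None, None)
#   ''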
def get_url_from(
    base_url,
    recursion_level,
    aggregate,
    parent_url=None,
    base_ref=None,
    line=None,
    column=None,
    page=0,
    name="",
    parent_content_type=None,
    extern=None,
    url_encoding=None,
):
    """
    Get url data from given base data.

    @param base_url: base url from a link tag
    @type base_url: string or None
    @param recursion_level: current recursion level
    @type recursion_level: number
    @param aggregate: aggregate object
    @type aggregate: aggregate.Consumer
    @param parent_url: parent url
    @type parent_url: string or None
    @param base_ref: base url from <base> tag
    @type base_ref: string or None
    @param line: line number
    @type line: number
    @param column: column number
    @type column: number
    @param page: page number
    @type page: number
    @param name: link name
    @type name: string
    @param parent_content_type: content type of the parent document
    @type parent_content_type: string or None
    @param extern: (is_extern, is_strict) or None
    @type extern: tuple(int, int) or None
    @param url_encoding: character encoding of the URL
    @type url_encoding: string or None
    """
    if base_url is not None:
        # left strip for detection of URL scheme
        base_url_stripped = base_url.lstrip()
    else:
        base_url_stripped = base_url
    url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
    if ":" in url:
        scheme = url.split(":", 1)[0].lower()
    else:
        scheme = None
        if not (url or name):
            # use filename as base url, with slash as path separator
            name = base_url.replace("\\", "/")
    allowed_schemes = aggregate.config["allowedschemes"]
    # ignore local PHP files with execution directives
    local_php = (
        parent_content_type == 'application/x-httpd-php'
        and '<?' in base_url
        and '?>' in base_url
        and scheme == 'file'
    )
    if local_php or (allowed_schemes and scheme not in allowed_schemes):
        klass = ignoreurl.IgnoreUrl
    else:
        assume_local_file = recursion_level == 0
        klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
        if "AnchorCheck" in aggregate.config["enabledplugins"] and \
                klass == fileurl.FileUrl:
            klass = fileurl.AnchorCheckFileUrl
    log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
    return klass(
        base_url,
        recursion_level,
        aggregate,
        parent_url=parent_url,
        base_ref=base_ref,
        line=line,
        column=column,
        page=page,
        name=name,
        extern=extern,
        url_encoding=url_encoding,
    )
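
# Hedged usage sketch (the configuration/aggregate setup below is an
# assumption based on how the linkchecker command line builds these
# objects; it is not part of this module):
#
#   from linkcheck import configuration
#   from linkcheck.director import get_aggregate
#
#   config = configuration.Configuration()
#   config.sanitize()
#   aggregate = get_aggregate(config)
#   url_data = get_url_from("http://example.com/", 0, aggregate)
#   # url_data should be an httpurl.HttpUrl instance ready for checking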
def get_urlclass_from(scheme, assume_local_file=False):
    """Return checker class for given URL scheme.

    If the scheme cannot be matched and assume_local_file is True,
    assume a local file.
    """
    if scheme in ("http", "https"):
        klass = httpurl.HttpUrl
    elif scheme == "ftp":
        klass = ftpurl.FtpUrl
    elif scheme == "file":
        klass = fileurl.FileUrl
    elif scheme == "mailto":
        klass = mailtourl.MailtoUrl
    elif scheme == "dns":
        klass = dnsurl.DnsUrl
    elif scheme == "itms-services":
        klass = itmsservicesurl.ItmsServicesUrl
    elif scheme and unknownurl.is_unknown_scheme(scheme):
        klass = unknownurl.UnknownUrl
    elif assume_local_file:
        klass = fileurl.FileUrl
    else:
        klass = unknownurl.UnknownUrl
    return klass
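
# Illustrative mapping (an editorial sketch, not part of the original
# module):
#
#   >>> get_urlclass_from("https").__name__
#   'HttpUrl'
#   >>> get_urlclass_from("mailto").__name__
#   'MailtoUrl'
#   >>> # no scheme at all, e.g. a bare path given on the command line
#   >>> get_urlclass_from(None, assume_local_file=True).__name__
#   'FileUrl'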
def get_index_html(urls):
    """
    Construct artificial index.html from given URLs.

    @param urls: URL strings
    @type urls: iterator of string
    """
    lines = ["<html>", "<body>"]
    for entry in urls:
        name = html.escape(entry)
        try:
            url = html.escape(urllib.parse.quote(entry))
        except UnicodeEncodeError:
            log.warn(LOG_CHECK, "Unable to convert entry to Unicode")
            continue
        except KeyError:
            # Some unicode entries raise KeyError.
            url = name
        lines.append(f'<a href="{url}">{name}</a>')
    lines.extend(["</body>", "</html>"])
    return os.linesep.join(lines).encode()
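
# Example output (an editorial illustration, not from the original source;
# shown for a POSIX system where os.linesep is "\n"):
#
#   >>> get_index_html(["foo bar.html"])
#   b'<html>\n<body>\n<a href="foo%20bar.html">foo bar.html</a>\n</body>\n</html>'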
# all the URL classes
from . import (  # noqa: E402
    fileurl,
    unknownurl,
    ftpurl,
    httpurl,
    dnsurl,
    mailtourl,
    ignoreurl,
    itmsservicesurl,
)