#!/usr/bin/python2.4
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2005  Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Check HTML pages for broken links.
"""

import sys
import getopt
import codecs
import re
import os
import pprint
import socket
import optparse

# Set a default timeout of 60 seconds for sockets created from here on.
# The --timeout option handled further below may replace this value.
default_timeout = 60
socket.setdefaulttimeout(default_timeout)

import linkcheck
# initialize i18n, puts _() function into global namespace
linkcheck.init_i18n()
# Override the optparse gettext function with the _() function installed
# above, so messages produced by optparse use the same translation function.
optparse._ = _
import linkcheck.log
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.configuration
import linkcheck.strformat
# Optional modules: each has_* flag records whether the import succeeded.
# optcomplete is only used for command line auto completion.
try:
    import optcomplete
    has_optcomplete = True
except ImportError:
    has_optcomplete = False
# profile is needed to run the check with the --profile option
try:
    import profile
    has_profile = True
except ImportError:
    has_profile = False
# pstats is needed to display profiling data with the --viewprof option
try:
    import pstats
    has_pstats = True
except ImportError:
    has_pstats = False

# Default profiling filename. It is a relative name, so the file is written
# to and read from the current working directory.
_profile = "linkchecker.prof"
# Credentials given with the -u/--user and -p/--password options; they stay
# None unless the corresponding option is given on the command line.
_username = None
_password = None

# Main usage text, returned by LCOptionParser.get_usage().
# Note that this is not a raw string: \t is a real tab character.
Usage = _("""USAGE\tlinkchecker [options] file-or-url...
""")

# General notes, appended to the option help by LCOptionParser.print_help().
Notes = _("""NOTES
o A ! before a regular expression negates it. So '!^mailto:' matches
  everything but a mailto link.
o URLs on the command line starting with "ftp." are treated like
  "ftp://ftp.", URLs starting with "www." are treated like "http://www.".
  You can also give local files as arguments.
o If you have your system configured to automatically establish a
  connection to the internet (e.g. with diald), it will connect when
  checking links not pointing to your local system.
  See the --extern-strict-all option on how to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker disables it
  automatically.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
  on Unix or Windows.
  On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
  same as the host of the user browsing your pages!
""")

# Description of the exit status, appended to the option help by
# LCOptionParser.print_help().
Retval = _(r"""RETURN VALUE
The return value is non-zero when
 o invalid links were found or
 o link warnings were found and --warnings option was given
 o a program error occurred
""")

# Usage examples, appended to the option help by
# LCOptionParser.print_help(). This is a raw string, so the backslashes of
# the Windows path are kept literally.
Examples = _(r"""EXAMPLES
The most common use checks the given domain recursively, plus any
single URL pointing outside of the domain:
  linkchecker http://treasure.calvinsplayground.de/
Beware that this checks the whole site which can have several hundred
thousands URLs. Use the -r option to restrict the recursion depth.

Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
  linkchecker --intern='!^mailto:' --extern-strict-all www.mysite.org

Checking a local HTML file on Unix:
  linkchecker ../bla.html

Checking a local HTML file on Windows:
  linkchecker c:\temp\test.html

You can skip the "http://" url part if the domain starts with "www.":
  linkchecker www.myhomepage.de

You can skip the "ftp://" url part if the domain starts with "ftp.":
  linkchecker -r0 ftp.linux.org
""")

# Description of the available output (logger) types.
# NOTE(review): this text is not referenced anywhere else in this file, in
# particular not by LCOptionParser.print_help() - confirm whether it should
# be part of the help output.
Logertypes = _(r"""OUTPUT TYPES
Note that by default only errors are logged.

text    Standard text output, logging URLs in keyword: argument fashion.
html    Log URLs in keyword: argument fashion, formatted as HTML.
        Additionally has links to the referenced pages. Invalid URLs have
        HTML and CSS syntax check links appended.
csv     Log check result in CSV format with one URL per line.
gml     Log parent-child relations between linked URLs as a GML graph.
        You should use the --verbose option to get a complete graph.
dot     Log parent-child relations between linked URLs as a DOT graph.
        You should use the --verbose option to get a complete graph.
xml     Log check result as machine-readable XML file.
sql     Log check result as SQL script with INSERT commands. An example
        script to create the initial SQL table is included as create.sql.
blacklist
        Suitable for cron jobs. Logs the check result into a file
        ~/.linkchecker/blacklist which only contains entries with invalid
        urls and the number of times they have failed.
none    Logs nothing. Suitable for scripts.
""")


def encode (s, codec="iso8859-15"):
    """
    Return the string s encoded with the given codec, ready to be printed
    on screen. Characters the codec cannot represent are dropped.
    """
    encoded = s.encode(codec, "ignore")
    return encoded


def print_version ():
    """
    Print the program version and exit.
    """
    print encode(linkcheck.configuration.AppInfo)
    sys.exit(0)


def print_usage (msg):
    """
    Report a command line error and terminate.

    Writes the translated error text for msg and a hint on how to get
    help to stderr, one per line, then exits with status 1.
    """
    lines = (
        _("Error: %s") % msg,
        _("Execute 'linkchecker -h' for help"),
    )
    for line in lines:
        sys.stderr.write(encode(line))
        sys.stderr.write(os.linesep)
    sys.exit(1)


def viewprof ():
    """
    Display previously recorded profiling data and terminate.

    Exits with status 1 if the pstats module or the profiling file is
    not available. Otherwise prints the statistics sorted by cumulative
    time, restricted to 100 entries, and exits with status 0.
    """
    if not has_pstats:
        linkcheck.log.error(linkcheck.LOG_CMDLINE,
                           _("The `pstats' Python module is not installed,"
                           " therefore the --viewprof option is disabled."))
        sys.exit(1)
    if os.path.exists(_profile):
        # each of these calls modifies the Stats object in place
        data = pstats.Stats(_profile)
        data.strip_dirs()
        data.sort_stats("cumulative")
        data.print_stats(100)
        sys.exit(0)
    # no profiling data has been written yet
    linkcheck.log.warn(linkcheck.LOG_CMDLINE,
                       _("Could not find profiling file %r.") % _profile)
    hint = _("Please run linkchecker with --profile to generate it.")
    sys.stderr.write(hint)
    sys.stderr.write(os.linesep)
    sys.exit(1)


def try_compile_re (arg):
    """
    Compile a regular expression given on the command line.

    arg is the pattern string. Returns the compiled pattern object.
    If the pattern has a syntax error, an error naming the pattern and
    the reason is logged to the command line logger and the program
    exits with status 1.
    """
    try:
        return re.compile(arg)
    except re.error:
        # Fetch the exception object with sys.exc_info() so this handler
        # does not depend on version specific "except" syntax.
        msg = sys.exc_info()[1]
        # The translated format string has to be filled in with the %
        # operator; _() itself only accepts the message to translate.
        linkcheck.log.error(linkcheck.LOG_CMDLINE,
           _("Syntax error in %r: %s") % (arg, str(msg)))
        sys.exit(1)


def has_encoding (encoding):
    """
    Return True if a codec with the given name can be looked up, else
    False.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        return False
    return True


class LCHelpFormatter (optparse.IndentedHelpFormatter):
    """
    Help formatter which wraps the option help text with the LinkChecker
    text wrapper.
    """

    def format_option (self, option):
        """
        Return the formatted help entry of one option as a string.

        An entry consists of the option strings with metavars, eg.
        "-x" or "-fFILENAME, --file=FILENAME", followed by the
        user-supplied help text. When the option strings fit into the
        column left of the help position, the help text starts on the
        same line:
          -x      turn on expert mode
        Otherwise the help text starts on the next line, indented to the
        help position:
          -fFILENAME, --file=FILENAME
                  read data from FILENAME
        """
        strings = self.option_strings[option]
        width = self.help_position - self.current_indent - 2
        margin = "%*s" % (self.current_indent, "")
        if len(strings) <= width:
            # option strings are short enough, help starts on this line
            head = "%s%-*s  " % (margin, width, strings)
            first_indent = 0
        else:
            head = "%s%s\n" % (margin, strings)
            first_indent = self.help_position
        parts = [head]
        if option.help:
            wrapped = linkcheck.strformat.wrap(option.help, self.help_width)
            lines = wrapped.splitlines()
            parts.append("%*s%s\n" % (first_indent, "", lines[0]))
            # continuation lines are always aligned at the help position
            for line in lines[1:]:
                parts.append("%*s%s\n" % (self.help_position, "", line))
        elif not head.endswith("\n"):
            # no help text: just terminate the line of the option strings
            parts.append("\n")
        return "".join(parts)


class LCOptionParser (optparse.OptionParser, object):
    """
    Option parser with custom help text layout.
    """

    def __init__ (self):
        """
        Initializing using our own help formatter class.
        """
        super(LCOptionParser, self).__init__(formatter=LCHelpFormatter())

    def error (self, msg):
        """
        Print usage info and given message.
        """
        print_usage(msg)

    def get_usage (self):
        """
        Return translated usage text.
        """
        return Usage

    def print_help (self, file=None):
        """
        Print translated help text.
        """
        s = u"%s\n%s\n%s\n%s" % (self.format_help(), Notes, Retval, Examples)
        s = s.encode("iso-8859-1", "replace")
        if os.name != 'posix':
            linkcheck.strformat.paginate(s)
        else:
            print s
        sys.exit(0)

# instantiate option parser and configure options
optparser = LCOptionParser()

################# general options ##################
# The options are organized in groups; each group is filled first and then
# added to the parser. The help texts are passed through _() for translation.
group = optparse.OptionGroup(optparser, _("General options"))
# -f: alternative configuration file name, stored in options.configfile
group.add_option("-f", "--config", type="string", dest="configfile",
                 help=_(
"""Use CONFIGFILE as configuration file. As default LinkChecker first
searches /etc/linkchecker/linkcheckerrc and then ~/.linkchecker/linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc)."""))
# -I: flag, stored in options.interactive
group.add_option("-I", "--interactive", action="store_true",
                 dest="interactive", help=_(
"""Ask for url if none are given on the commandline."""))
# -t: integer thread limit, stored in options.threads
group.add_option("-t", "--threads", type="int", dest="threads",
                 help=_(
"""Generate no more than num threads. Default number of threads is 10."""))

# -V: flag, stored in options.version
group.add_option("-V", "--version", action="store_true", dest="version",
                 help=_(
"""Print version and exit."""))

# --priority: flag, stored in options.priority
group.add_option("--priority", action="store_true", dest="priority",
                 help=_(
"""Run with normal thread scheduling priority. Per default LinkChecker
runs with low thread priority to be suitable as a background job."""))
optparser.add_option_group(group)


################# output options ##################
# Options controlling what is logged and where the log output goes.
group = optparse.OptionGroup(optparser, _("Output options"))
group.add_option("-v", "--verbose", action="store_true", dest="verbose",
                 help=_(
"""Log all checked URLs (implies -w). Default is to log only invalid
URLs."""))
group.add_option("-w", "--warnings", action="store_true", dest="warnings",
                 help=_("""Log warnings."""))
group.add_option("-W", "--warning-regex", type="string", dest="warningregex",
                 help=_(
"""Define a regular expression which prints a warning if it matches
any content of the checked link. This applies only to valid pages,
so we can get their content.

Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'. This option implies -w."""))
# The size limit is a number of bytes. Without an explicit type optparse
# stores the raw argument string, so the value is declared as an integer;
# optparse then also rejects arguments that are not numbers.
group.add_option("--warning-size-bytes", type="int",
                 dest="warningsizebytes",
                 help=_(
"""Print a warning if content size is available and exceeds the given
number of bytes. This option implies -w."""))
group.add_option("-q", "--quiet", action="store_true", dest="quiet",
                 help=_(
"""Quiet operation. This is only useful with -F."""))
# The list of logger types is filled into the translated help text
# after translation.
group.add_option("-o", "--output", type="string", dest="output",
                 metavar="TYPE[/ENCODING]",
                 help=_(
"""Specify output as %(loggertypes)s. Default output type is text.
ENCODING specifies the output encoding, the default is "iso-8859-15".
Valid encodings are listed at http://docs.python.org/lib/node127.html.""") % \
{'loggertypes': linkcheck.LoggerKeys})
# -F may be given several times; all arguments are collected in the
# list options.fileoutput.
group.add_option("-F", "--file-output", type="string", action="append",
                 dest="fileoutput", metavar="TYPE[/ENCODING][/FILENAME]",
                 help=_(
"""Output to a file linkchecker-out.TYPE, $HOME/.linkchecker/blacklist for
'blacklist' output, or FILENAME if specified.
ENCODING specifies the output encoding, the default is "iso-8859-15".
Valid encodings are listed at http://docs.python.org/lib/node127.html.
The FILENAME and ENCODING parts of the 'none' output type will be ignored,
else if the file already exists, it will be overwritten.
You can specify this option more than once. Valid file output TYPEs
are %(loggertypes)s. You can specify this option multiple times to output
to more than one file. Default is no file output. Note that you can
suppress all console output with the option '-o none'.""") % \
{'loggertypes': linkcheck.LoggerKeys})
# --no-status stores False in options.status, which is True by default
group.add_option("--no-status", action="store_false", dest="status",
                 default=True, help=_(
"""Do not print check status messages."""))
# No dest is given here, so optparse derives it from the long option
# name: the logger names are collected in the list options.debug.
group.add_option("-D", "--debug", type="string", action="append",
                 metavar="LOGGER",
                 help=_("""Print debugging output for given logger.
Available loggers are %(lognamelist)s.
Specifying 'all' is an alias for specifying all available loggers.
The option can be given multiple times to debug with more
than one logger.

For accurate results, threading will be disabled during debug runs.""") % \
{"lognamelist": linkcheck.lognamelist})
group.add_option("--profile", action="store_true", dest="profile",
                 help=_(
"""Write profiling data into a file named %s in the
current working directory. See also --viewprof.""") % _profile)
group.add_option("--viewprof", action="store_true", dest="viewprof",
                 help=_(
"""Print out previously generated profiling data. See also --profile."""))
optparser.add_option_group(group)


################# checking options ##################
# Options controlling which links are checked and how.
group = optparse.OptionGroup(optparser, _("Checking options"))
group.add_option("-r", "--recursion-level", type="int", dest="recursionlevel",
                 help=_(
"""Check recursively all links up to given depth. A negative depth
will enable inifinite recursion. Default depth is infinite."""))
# -i, -e and --extern-strict may be given several times; the expressions
# are collected in lists.
group.add_option("-i", "--intern", type="string", action="append",
                 dest="intern", help=_(
""" regex, --intern=regex
Assume URLs that match the given expression as internal.
LinkChecker descends recursively only to internal URLs, not to
external."""))
group.add_option("-e", "--extern", type="string", action="append",
                 dest="extern", help=_(
"""Assume urls that match the given expression as external.
Only internal HTML links are checked recursively."""))
group.add_option("--extern-strict", type="string", action="append",
                 dest="externstrict", help=_(
"""Assume urls that match the given expression as strict external.
Only internal HTML links are checked recursively."""))
group.add_option("-s", "--extern-strict-all", action="store_true",
                 dest="externstrictall", help=_(
"""Check only syntax of external links, do not try to connect to them.
For local file urls, only local files are internal. For
http and ftp urls, all urls at the same domain name are internal."""))
group.add_option("-d", "--denyallow", action="store_true", dest="denyallow",
                 help=_(
"""Swap checking order to external/internal. Default checking order
is internal/external."""))
group.add_option("-C", "--cookies", action="store_true", dest="cookies",
                 help=_(
"""Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information."""))
group.add_option("-a", "--anchors", action="store_true", dest="anchors",
                 help=_(
"""Check HTTP anchor references. This option applies to both internal
and external urls. Default is don't check anchors.
This option implies -w because anchor errors are always warnings."""))
# giving --no-anchor-caching stores False in options.anchorcaching
group.add_option("--no-anchor-caching", action="store_false",
                 dest="anchorcaching", help=_(
"""Treat url#anchora and url#anchorb as equal on caching. This
is the default browser behaviour, but it's not specified in
the URI specification. Use with care."""))
group.add_option("-u", "--user", type="string", dest="username",
                 help=_(
"""Try given username for HTTP and FTP authorization.
For FTP the default username is 'anonymous'. See also -p."""))
group.add_option("-p", "--password", type="string", dest="password",
                 help=_(
"""Try given password for HTTP and FTP authorization.
For FTP the default password is 'anonymous@'. See also -u."""))
# the default timeout value is filled into the translated help text
group.add_option("--timeout", type="int", dest="timeout",
                 help=_(
"""Set the timeout for TCP connection attempts in seconds. The default
timeout is %d seconds.""") % default_timeout)
group.add_option("-P", "--pause", type="int", dest="pause",
                 help=_(
"""Pause PAUSE seconds between each url check. This option implies -t0.
Default is no pause between requests."""))
group.add_option("-N", "--nntp-server", type="string", dest="nntpserver",
                 help=_(
"""Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked."""))
group.add_option("--no-proxy-for", type="string", action="append",
                 dest="noproxyfor", help=_(
"""Contact hosts that match the given expression directly instead of
going through a proxy."""))
optparser.add_option_group(group)

################# deprecated options ##################
group = optparse.OptionGroup(optparser, _("Deprecated options"))
# --status writes to the same destination as --no-status, whose default
# value is already True.
group.add_option("--status", action="store_true", dest="status",
                 help=_(
"""Print check status every 5 seconds to stderr. This is the default."""))
optparser.add_option_group(group)

################# auto completion #####################
# only available if the optional optcomplete module could be imported
if has_optcomplete:
    optcomplete.autocomplete(optparser)

# read and parse command line options and arguments
(options, args) = optparser.parse_args()

# build a config object for this check session
config = linkcheck.configuration.Configuration()
# initialize logging
# Every logger name given with -D must be a known log name, else the
# program exits with a usage error.
if options.debug:
    allowed_debugs = linkcheck.lognames.keys()
    for _name in options.debug:
        if _name not in allowed_debugs:
            print_usage(_("Invalid debug level %(level)r") % {'level': _name})
config.init_logging(debug=options.debug)
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "Python %s on %s",
                    sys.version, sys.platform)
# read configuration files
# A file given with -f is read instead of the files config.read() uses when
# called without arguments.
try:
    if options.configfile:
        config.read(files=[options.configfile])
    else:
        config.read()
except linkcheck.LinkCheckerError, msg:
    # config error
    print_usage(str(msg))
# apply commandline options and arguments
# The configuration files have been read before, so values given on the
# command line are written over the configured ones.
# constructauth is set when -u or -p is given, do_profile when --profile
# is given.
constructauth = False
do_profile = False
# low thread priority unless --priority is given
if not options.priority:
    import linkcheck.threader
    linkcheck.threader.set_thread_priority(linkcheck.threader.PRIO_LOW)
# Flag options are None when not given on the command line, so the
# configured value is only replaced when the flag was given.
if options.anchors is not None:
    config["anchors"] = options.anchors
    # -a implies -w
    config["warnings"] = True
# both --extern and --extern-strict patterns are added to "externlinks"
if options.extern:
    pats = [linkcheck.get_link_pat(arg) for arg in options.extern]
    config["externlinks"].extend(pats)
if options.externstrict:
    pats = [linkcheck.get_link_pat(arg, strict=True) \
            for arg in options.externstrict]
    config["externlinks"].extend(pats)
if options.noproxyfor:
    # try_compile_re() exits the program on an invalid expression
    ros = [try_compile_re(arg) for arg in options.noproxyfor]
    config["noproxyfor"].extend(ros)
# -o TYPE[/ENCODING]: replace the console logger
if options.output:
    if "/" in options.output:
        logtype, encoding = options.output.split("/", 1)
    else:
        logtype, encoding = options.output, "iso-8859-15"
    # the logger type is looked up case insensitively
    if not linkcheck.Loggers.has_key(logtype.lower()):
        print_usage(_("Unknown logger type %r in %r for option %s") % \
                   (logtype, options.output, "'-o, --output'"))
    # the encoding is not validated for the 'none' logger type
    if logtype != 'none' and not has_encoding(encoding):
        print_usage(_("Unknown encoding %r in %r for option %s") % \
                   (encoding, options.output, "'-o, --output'"))
    config['logger'] = config.logger_new(logtype.lower(), encoding=encoding)
if options.fileoutput:
    ns = {'fileoutput': 1}
    for arg in options.fileoutput:
        ftype = arg
        # look for (optional) filename and encoding
        if '/' in ftype:
            ftype, suffix = ftype.split('/', 1)
            if suffix:
                if has_encoding(suffix):
                    # it was an encoding
                    ns['encoding'] = suffix
                elif '/' in suffix:
                    # look for (optional) encoding
                    encoding, filename = suffix.split('/', 1)
                    if has_encoding(encoding):
                        ns['encoding'] = encoding
                        ns['filename'] = filename
                    else:
                        ns['filename'] = suffix
                else:
                    ns['filename'] = suffix
        if not linkcheck.Loggers.has_key(ftype):
            print_usage(_("Unknown logger type %r in %r for option %s") % \
                       (ftype, options.output, "'-F, --file-output'"))
        if ftype != 'none' and 'encoding' in ns and \
           not has_encoding(ns['encoding']):
            print_usage(_("Unknown encoding %r in %r for option %s") % \
                       ns['encoding'], options.output, "'-F, --file-output'")
        # generating loggers with fileoutput can throw
        # an exception when opening the file
        try:
            logger = config.logger_new(ftype, **ns)
        except OSError, msg:
            print_usage(_("Illegal argument %r for option %s: %s") % \
                       (arg, "'-F, --file-output'", str(msg)))
        config['fileoutput'].append(logger)
# Apply the remaining options. Options which were not given on the command
# line are None and leave the configured value untouched.
if options.interactive is not None:
    config['interactive'] = options.interactive
if options.intern:
    pats = [linkcheck.get_link_pat(arg) for arg in options.intern]
    config["internlinks"].extend(pats)
if options.denyallow is not None:
    config["denyallow"] = options.denyallow
if options.nntpserver:
    config["nntpserver"] = options.nntpserver
if options.anchorcaching is not None:
    config["anchorcaching"] = options.anchorcaching
# the password is only remembered here; the authentication entry is
# built at the end of this section
if options.password is not None:
    _password = options.password
    constructauth = True
# negative pause values are rejected with a usage error
if options.pause is not None:
    if options.pause >= 0:
        config["wait"] = options.pause
    else:
        print_usage(_("Illegal argument %d for option %s") % \
                   (options.pause, "'-P, --pause'"))
if options.profile is not None:
    do_profile = options.profile
# -q replaces the console logger with the 'none' logger
if options.quiet is not None:
    config['logger'] = config.logger_new('none')
if options.recursionlevel is not None:
    config["recursionlevel"] = options.recursionlevel
if options.externstrictall is not None:
    config["externstrictall"] = options.externstrictall
if options.status is not None:
    config['status'] = options.status
# at least one thread is required; print_usage() exits the program
if options.threads is not None:
    if options.threads < 1:
        print_usage(_("Illegal argument %d for option %s") % \
                   (options.threads, "'-t, --threads'"))
    config["threads"] = options.threads
# a valid timeout replaces the default socket timeout set at program start
if options.timeout is not None:
    if options.timeout > 0:
        socket.setdefaulttimeout(options.timeout)
    else:
        print_usage(_("Illegal argument %r for option %s") % \
                   (options.timeout, "'--timeout'"))
if options.username is not None:
    _username = options.username
    constructauth = True
# print_version() exits the program
if options.version is not None:
    print_version()
# -v implies -w
if options.verbose is not None:
    if options.verbose:
        config["verbose"] = True
        config["warnings"] = True
# viewprof() exits the program
if options.viewprof:
    viewprof()
if options.warnings is not None:
    config["warnings"] = options.warnings
# -W implies -w; try_compile_re() exits on an invalid expression
if options.warningregex is not None:
    config["warningregex"] = try_compile_re(options.warningregex)
    config["warnings"] = True
if options.warningsizebytes is not None:
    config["warnsizebytes"] = options.warningsizebytes
if options.cookies is not None:
    config['cookies'] = options.cookies
# Credentials from the command line are appended as an authentication
# entry whose pattern ".+" matches every non-empty string. If only one of
# -u and -p was given, the other value is None.
if constructauth:
    config["authentication"].append({'pattern': try_compile_re(".+"),
                                     'user': _username,
                                     'password': _password})

# log the complete configuration for debugging
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "configuration: %s",
                    pprint.pformat(config.items()))
# warn about sitemap loggers and verbose output
# The classes of the console logger and of all file loggers are checked.
# NOTE(review): linkcheck.logger is not imported explicitly in this file;
# presumably it is loaded by the linkcheck package itself - confirm.
klasses = [c.__class__ for c in [config['logger']] + config['fileoutput']]
if (linkcheck.logger.gml.GMLLogger in klasses or \
    linkcheck.logger.dot.DOTLogger in klasses) and not config['verbose']:
    linkcheck.log.warn(linkcheck.LOG_CMDLINE,
           _("Using DOT or GML loggers without verbose output"
             " gives an incomplete sitemap graph."))

# try to generate empty config dir if not already there
userconfdir = linkcheck.configuration.normpath("~/.linkchecker")
if not os.path.exists(userconfdir):
    try:
        # mode 0700: the directory is only accessible by its owner
        os.mkdir(userconfdir, 0700)
    except OSError:
        # ignore errors
        pass

# Without files or URLs on the command line either ask for them
# (interactive mode) or log a warning.
if not args:
    if not config['interactive']:
        linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or urls given"))
    else:
        urls = raw_input(
                  _("enter one or more urls, separated by white-space\n--> "))
        args = urls.split()

# set up the URL cache and the consumer working on it
cache = linkcheck.checker.cache.Cache()
consumer = linkcheck.checker.consumer.Consumer(config, cache)
# Queue every given URL. As a convenience a missing scheme is added for
# arguments starting with "ftp." or "www." (compared case insensitively).
for url in args:
    if url[:4].lower() == "ftp.":
        url = "ftp://%s" % url
    elif url[:4].lower() == "www.":
        url = "http://%s" % url
    url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
    consumer.append_url(url_data)

############################# check the urls ################################
# Profiling was requested but the profile module is missing: warn and run
# the check without profiling (the else branch below).
if do_profile and not has_profile:
    linkcheck.log.warn(linkcheck.LOG_CMDLINE,
                       _("The `profile' Python module is not installed,"
                       " therefore the --profile option is disabled."))

if do_profile and has_profile:
    run = True
    # ask before an existing profiling file is overwritten
    if os.path.exists(_profile):
        question = _("""Overwrite profiling file %r?
Press Ctrl-C to cancel, RETURN to continue.""") % _profile
        try:
            raw_input(question)
        except KeyboardInterrupt:
            sys.stderr.write(os.linesep)
            sys.stderr.write(_("Canceled."))
            sys.stderr.write(os.linesep)
            # canceled: the URLs are not checked at all
            run = False
    if run:
        # run the check under the profiler, writing the data to _profile
        profile.run("linkcheck.checker.check_urls(consumer)", _profile)
else:
    # Normal run. Psyco is used if it is installed in a recent enough
    # version.
    try:
        import psyco
        # psyco >= 1.4.0 final is needed
        if psyco.__version__ >= 0x10400f0:
            psyco.profile()
        else:
            # warn about old psyco version
            linkcheck.log.warn(linkcheck.LOG_CMDLINE, 
         _("""Psyco is installed but not used since the version is too old.
Psyco >= 1.4 is needed."""))
    except ImportError:
        # no psyco available, just ignore
        pass
    linkcheck.checker.check_urls(consumer)
#############################################################################
#############################################################################

# in interactive mode wait for the user before finishing
if config['interactive']:
    raw_input(_("Hit RETURN to finish"))

# Exit with non-zero status if errors were encountered. The same applies
# to warnings when the --warnings option was given.
if consumer.errors:
    sys.exit(1)
if consumer.warnings and config['warnings']:
    sys.exit(1)
