#!/usr/bin/env python
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent:
u"""

RSS Mirror
----------

Mirrors on the local disk the pages listed in an RSS feed, using Wget or
HTTrack.

Requires Python >= 2.6


Configuration file
~~~~~~~~~~~~~~~~~~

RSS-mirror uses a configuration file to list the RSS feeds that should be
downloaded. This file is in INI format and is read from
``~/.config/rss-mirror.conf`` unless another path is given with the ``-c``
option. Example::

    [DEFAULT]
    output = ~/pda/webpages

    [owni]
    url = http://owni.fr/feed

    [zenhabits]
    url = http://zenhabits.net/feed

    [rue89-ecologie]
    url = http://www.rue89.com/tag/ecologie/feed
    title = Rue89 - Ecologie

The ``DEFAULT`` section has an ``output`` key pointing to the output directory
where the webpages will be downloaded.

Each section (except DEFAULT) is a feed to download. It has a ``url`` key which
is self-explanatory and an optional ``title`` key which will be used as a title
for the feed in the summary page.


Credits
~~~~~~~

.. :Authors:
       Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>

.. :License:
       GNU GPL v3 or later

"""

import os
import sys
import urllib
import urllib2
import re
import shutil
import optparse
import datetime
import time
import itertools
import xml.etree.ElementTree as etree
from urlparse import urlparse
from subprocess import call
from zipfile import ZipFile, BadZipfile
from ConfigParser import SafeConfigParser


# Version of the iUI archive fetched by Repository.download_iui()
IUI_VERSION = "0.40-alpha1"
# Configuration file used when the -c option is not given ("~" is expanded
# with os.path.expanduser() before use)
CONFIG_PATH = "~/.config/rss-mirror.conf"


class Downloader(object):
    """
    Base class of the download engines; it only defines the interface.

    :cvar return_codes_ok: non-zero exit statuses of the external tool which
        must not be treated as a failure
    :type return_codes_ok: ``list``
    """

    return_codes_ok = list()

    def get_command(self, destdir, url, options=None):
        """Build the system command which mirrors ``url`` into ``destdir``"""
        raise NotImplementedError()

    def get_start_path(self, basedir, page):
        """Locate the downloaded page on the disk"""
        raise NotImplementedError()


class HttrackDownloader(Downloader):
    """
    Download using httrack. More features than wget, but it has some bugs, like
    downloading CSS stylesheets in ``@import`` constructs.

    :cvar name: engine name, as selected by the ``engine`` configuration key
    :cvar opts: httrack switches used for every download
    """

    name = "httrack"
    opts = [
        "-%l", "fr", # language
        "-Y", # mirror links
        "-C0", # no cache
        "-b0", # no cookies
        "-n", # download "near" files
        "-L0", # DOS-compatible file names
        "-d", # stay on the same domain
        "-x", # replace external links by error page
        "-%u", #url hacks: various hacks to limit duplicate URLs
        "-F", "rss-mirror (allow like Gecko)", # user-agent
    ]

    def __init__(self):
        """Add the recursion switch when ``recursive`` is set in the config"""
        super(HttrackDownloader, self).__init__()
        # Work on a per-instance copy: appending to the class-level list
        # would add the recursion switch once more for every new instance.
        self.opts = list(type(self).opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.append("-r%d" % recursive)

    def get_command(self, destdir, url, options=None):
        """
        Build the httrack command line.

        :param destdir: directory where the page is mirrored (``-O`` switch)
        :param url: address of the page
        :param options: extra switches, inserted after the standard ones
        :return: the command and its arguments
        :rtype: ``list``
        """
        command = ["httrack"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-O", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Read the start page location from the redirection found in the
        ``index.html`` file of the page directory.

        :param basedir: directory of the feed
        :param title: name of the page directory inside ``basedir``
        :return: the target of the redirection
        :raise IOError: if ``index.html`` can't be read or holds no redirection
        """
        indexpath = os.path.join(basedir, title, "index.html")
        with open(indexpath) as indexfile:
            content = indexfile.read()
        mo = re.search('<meta HTTP-EQUIV="Refresh" CONTENT="0; URL=(.*)">',
                       content)
        if mo is None:
            # An IOError is what the caller already handles for unusable
            # pages; the bare AttributeError would abort the whole run.
            raise IOError("No redirection to the start page in %s" % indexpath)
        return mo.group(1)


class WgetDownloader(Downloader):
    """
    Download using wget. Simple and fast.

    I use the ``-nd`` switch to avoid creating the whole directory structure
    mirroring the website structure, because the FAT32 filesystem does not like
    very very long names.

    :cvar name: engine name, as selected by the ``engine`` configuration key
    :cvar opts: wget switches used for every download
    :cvar return_codes_ok: wget exit statuses which do not mean that the
        download of the page failed
    """

    name = "wget"
    opts = [
        "-nv", # non verbose
        "-k", # convert links
        "-p", # download needed files for the page
        "-N", # timestamping
        "--restrict-file-names=windows,ascii,lowercase",
        "-E", # adjust extension
        "-H", # allow going on a different domain
        "--timeout=15", # it's 900 by default...
        "--tries=2", # it's 20 by default...
        "-nd", # avoid having 255+ chars paths
        "--no-check-certificate", # SSL
        # User-agent: try to get the mobile version of the page
        "-U", ("Mozilla/5.0 (Linux; U; Android 2.2; en-us; rss-mirror) "
               "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile "
               "Safari/533.1"),
    ]
    return_codes_ok = [3, 4, 8]
    # 3: I/O error, usually because the filename is too long
    # 4: Network error (broken link on the page)
    # 8: Server issued error response (broken link on the page)

    def __init__(self):
        """Add the recursion switches when ``recursive`` is set in the config"""
        super(WgetDownloader, self).__init__()
        # Work on a per-instance copy: extending the class-level list would
        # add the recursion switches once more for every new instance.
        self.opts = list(type(self).opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.extend(["-r", "-l%d" % recursive])

    def get_command(self, destdir, url, options=None):
        """
        Build the wget command line.

        :param destdir: directory where the page is mirrored (``-P`` switch)
        :param url: address of the page
        :param options: extra switches, inserted after the standard ones
        :return: the command and its arguments
        :rtype: ``list``
        """
        command = ["wget"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-P", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Find the start page of a mirrored page, using the URL saved in the
        ``url.txt`` file of the page directory.

        :param basedir: directory of the feed
        :param title: name of the page directory inside ``basedir``
        :return: path of the start page relative to the page directory, or
            ``"."`` when no matching file exists
        :raise IOError: if the ``url.txt`` file can't be read
        """
        with open(os.path.join(basedir, title, "url.txt")) as urlfile:
            url = urlfile.read().strip()
        if url.endswith("/"):
            url += "index.html"
        # "-nd" (no directories) is the switch which flattens the mirror,
        # "-nv" only makes wget less verbose.
        if "-nd" in self.opts:
            local_path = self.get_start_path_nodirs(basedir, url)
        else:
            if os.path.exists(os.path.join(basedir, title, "index.html")):
                # downloaded with httrack
                httrack_dl = HttrackDownloader()
                return httrack_dl.get_start_path(basedir, title)
            local_path = self.get_start_path_dirs(basedir, url)
        candidate = os.path.join(basedir, title, local_path)
        if os.path.isfile(candidate):
            return local_path
        # presumably the ".html" suffix added by the "-E" switch
        if os.path.isfile(candidate + ".html"):
            return local_path + ".html"
        print >>sys.stderr, "Can't find the start page: ", local_path
        print >>sys.stderr, "Tried:", candidate, candidate + ".html"
        return "."

    def get_start_path_nodirs(self, basedir, url):
        """
        Compute the name of the start page when the files are all stored in
        the page directory: lowercased last component of the URL path (or
        ``index.html``), followed by ``@`` and the lowercased query string
        if there is one.
        """
        url_parsed = urlparse(url)
        local_path = os.path.basename(url_parsed[2]).lower()
        if not local_path:
            local_path = "index.html"
        if url_parsed[4]:
            local_path += "@" + url_parsed[4].lower().replace("/", "%2f")
        return local_path

    def get_start_path_dirs(self, basedir, url):
        """
        Compute the path of the start page when the directory structure of
        the website is reproduced: the lowercased URL without its scheme,
        with ``?`` replaced by ``@``.
        """
        # Only the leading scheme is removed, and https is handled too
        local_path = re.sub(r"(?i)^https?://", "", url)
        return local_path.lower().replace("?", "@")


def get_engines():
    """Map the name of each engine to its class (subclasses of Downloader)"""
    return dict((engine.name, engine)
                for engine in Downloader.__subclasses__())


def extract_options(desc):
    """
    Extract the downloader switches embedded in a description.

    The switches are written in ``{options: ...}`` markers, for instance
    ``{options: -r -l2}``. A description may contain several markers, and at
    least one whitespace character is required after the colon.

    NOTE(review): the switches are taken verbatim from the feed content and
    end up on the downloader command line; only use feeds you trust.

    :param desc: text of the description
    :return: the switches of all the markers, in order of appearance
    :rtype: ``list``
    """
    opts = []
    # Raw string: "\{" and "\s" are regex escapes, not string escapes. The
    # group captures what is inside the marker, no slicing needed.
    for inner_options in re.findall(r"\{options:\s+([^}]+)\}", desc):
        # split() without argument strips and never yields empty strings
        opts.extend(inner_options.split())
    return opts


class Page(object):
    """
        - ``title`` is a shortned title derived from the page title,
        - ``link`` is the URL,
        - ``title_full`` is the HTML page title,
        - ``timestamp`` is the UNIX timestamp of the page in the RSS feed, which is
          probably the moment you bookmarked it.
    """


    allowed_chars = re.compile("[^a-zA-Z0-9_-]")
    desc_cleanup = re.compile("<[^>]+>")

    @classmethod
    def parse(cls, item):
        titlesize = config.getint("DEFAULT", "title_size",)
        page = cls()
        page.title_full = item.findtext("title").strip()
        page.title = page.title_full[:titlesize].strip().lower()
        page.title = cls.allowed_chars.sub("", page.title.replace(" ","_"))
        page.link = item.findtext("link").strip()
        timestamp = item.findtext("pubDate")
        try:
            timestamp = datetime.datetime.strptime(timestamp,
                            "%a, %d %b %Y %H:%M:%S EDT")
            timestamp = int(time.mktime(timestamp.timetuple()))
        except ValueError:
            timestamp = int(time.time())
        page.timestamp = timestamp
        page.description = item.findtext("description", "")
        page.description = cls.desc_cleanup.sub("", page.description)
        page.options = extract_options(page.description)
        return page

    def download(self, outdir, downloader):
        """Use the downloader to mirror the page"""
        destdir = os.path.join(outdir, self.title)
        if os.path.exists(destdir):
            feedname = os.path.basename(outdir)
            print "Already downloaded:", os.path.join(feedname, self.title)
            return
        print "Downloading", self.title, self.link
        try:
            command = downloader.get_command(destdir, self.link,
                                             options=self.options)
            print " ".join(command)
            if config.getboolean("DEFAULT", "debug"):
                retcode = 0
            else:
                retcode = call(command)
            if retcode < 0:
                print
                print >> sys.stderr, "Child was terminated by signal", -retcode
                return
            if retcode != 0 and retcode not in downloader.return_codes_ok:
                print
                print >> sys.stderr, "Something went wrong while downloading " \
                                    + self.title + "(%s)" % self.link
                print >> sys.stderr, "Return code: %s" % retcode
                return
        except OSError, e:
            print
            print >> sys.stderr, "Execution failed:", e
            return
        except KeyboardInterrupt, e:
            print "Removing downloaded dir in 1 sec..." # to avoid partial downloads
            time.sleep(1)
            shutil.rmtree(destdir)
            return
        # Backup the URL in the url.txt file
        link_file = open(os.path.join(destdir, "url.txt"),"w")
        link_file.write(self.link)
        link_file.close()
        # Backup the HTML title in the title.txt file
        title_file = open(os.path.join(destdir, "title.txt"),"w")
        try:
            title_file.write(unicode(self.title_full).encode("utf-8"))
        except UnicodeEncodeError:
            title_file.write(self.title)
        title_file.close()
        # Backup the timestamp in the timestamp.txt file
        timestamp_file = open(os.path.join(destdir, "timestamp.txt"),"w")
        timestamp_file.write(str(self.timestamp))
        timestamp_file.close()
        print
        time.sleep(1) # Can't remember why this was necessary... FIXME



class Feed(object):
    """
    An RSS feed listed in the configuration file.

    :ivar name: name of the configuration section of the feed
    :ivar url: address of the feed
    :ivar title: title of the feed, ``None`` until `parse` is called
    :ivar pages: pages listed in the feed, empty until `parse` is called
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.title = None
        self.pages = []

    def parse(self):
        """
        Read the RSS feed, and store its title and the list of pages to
        mirror in the ``title`` and ``pages`` attributes.
        """
        content = urllib2.urlopen(self.url)
        try:
            feed = etree.parse(content)
        finally:
            # don't keep the connection open until garbage collection
            content.close()
        self.title = self.get_title(feed)
        self.pages = [Page.parse(item) for item in feed.findall(".//item")]

    def get_title(self, feed):
        """
        Choose the title of the feed: the ``title`` key of the configuration
        section if there is one, else the title of the channel, else the name
        of the feed.

        :param feed: the parsed XML document
        """
        if config.has_option(self.name, "title"):
            return config.get(self.name, "title")
        feed_title = feed.findtext("channel/title")
        if not feed_title:
            return self.name
        return feed_title


class Repository(object):
    """
    A folder containing mirrored pages
    """

    def __init__(self, path, feeds):
        self.path = os.path.expanduser(path)
        self.feeds = feeds

    def make_index(self, downloader):
        """Build the HTML index of the mirrored pages"""
        startfiles = {}
        for feed in self.feeds:
            startfiles[feed] = []
            destdir = os.path.join(self.path, feed.name)
            for page in feed.pages:
                if not os.path.exists(os.path.join(destdir, page.title)):
                    continue
                try:
                    local_path = downloader.get_start_path(destdir, page.title)
                except IOError:
                    print >> sys.stderr, "Can't find the url.txt file for %s" \
                                         % page.title
                    continue # no url.txt file, something went wrong
                startfiles[feed].append(
                        ( unicode(page.title_full).encode("utf-8"),
                          page.description,
                          "/".join([feed.name, page.title, local_path]) ) )
        mainindex = open(os.path.join(self.path, "index.html"), "w")
        mainindex.write("""<!DOCTYPE html>
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
  <title>Mirrored pages</title>
  <link rel="stylesheet" href="_iui/iui.css" type="text/css" />
  <link rel="stylesheet" href="_iui/t/default/default-theme.css" type="text/css"/>
  <script type="application/x-javascript" src="_iui/iui.js"></script>
</head>
<body>

<div class="toolbar">
  <h1 id="pageTitle"></h1>
    <a id="backButton" class="button" href="#"></a>
</div>
""")
        if len(startfiles) > 1:
            mainindex.write("""<ul id="index" title="Mirrored pages" selected="true">\n""")
            for feed in sorted(startfiles, key=lambda f: f.name):
                mainindex.write("""  <li><a href="#%(name)s">%(title)s</a></li>\n"""
                                % {"name": feed.name, "title": feed.title} )
            mainindex.write("</ul>\n\n")

        for feed in sorted(startfiles, key=lambda f: f.name):
            mainindex.write("""<ul id="%(name)s" title="%(title)s">\n"""
                            % {"name": feed.name, "title": feed.title})
            for title, description, index in startfiles[feed]:
                mainindex.write(
                    """  <li><a href="%s" target="_webapp">%s</a></li>\n""" %
                    (urllib.quote(index.encode("utf-8")), title))
            mainindex.write("</ul>\n\n")
        mainindex.write("""
</body>
</html>
""")


    def cleanup(self):
        """
        Remove mirrored pages which are not in the feed anymore (probably
        because you read them)
        """
        dirs_to_remove = self._get_old_feed_dirs()
        for feed in self.feeds:
            dirs_to_remove.extend(self._get_old_page_dirs(feed))
        for dirname in dirs_to_remove:
            print "Removing", dirname
            if not config.getboolean("DEFAULT", "debug"):
                shutil.rmtree(os.path.join(self.path, dirname))
        self.lowercase_dirs()
        if "_iui" not in os.listdir(self.path):
            self.download_iui()

    def _get_old_feed_dirs(self):
        dirs = []
        for feeddirname in os.listdir(self.path):
            if feeddirname.startswith("."):
                continue
            if feeddirname == "_iui":
                continue
            if not os.path.isdir(os.path.join(self.path, feeddirname)):
                continue # like "index.html" for example
            if feeddirname not in [ f.name for f in self.feeds ]:
                dirs.append(feeddirname)
        return dirs

    def _get_old_page_dirs(self, feed):
        if not os.path.isdir(os.path.join(self.path, feed.name)):
            return []
        dirs = []
        for dirname in os.listdir(os.path.join(self.path, feed.name)):
            if dirname.startswith("."):
                continue
            if dirname not in [ p.title for p in feed.pages ]:
                dirs.append(os.path.join(feed.name, dirname))
        return dirs

    def lowercase_dirs(self):
        """For FAT32 transparency"""
        for feed in self.feeds:
            for root, dirs, files in os.walk(
                        os.path.join(self.path, feed.name), topdown=False):
                for name in dirs:
                    newname = name.lower()
                    if name != newname:
                        source = os.path.join(self.path, feed.name, root, name)
                        dest = os.path.join(self.path, feed.name, root, newname)
                        if os.path.exists(dest):
                            continue
                        os.rename(source, dest)


    def download_iui(self):
        print "Downloading iUI... ",
        sys.stdout.flush()
        iui_url = "http://iui.googlecode.com/files/iui-%s.zip" % IUI_VERSION
        try:
            fn, _headers = urllib.urlretrieve(iui_url)
            with ZipFile(fn, "r") as archive:
                archive.extractall(self.path)
            os.remove(fn)
            os.rename(os.path.join(self.path, "iui-%s/web-app/iui" % IUI_VERSION),
                      os.path.join(self.path, "_iui"))
            shutil.rmtree(os.path.join(self.path, "iui-%s" % IUI_VERSION))
        except (IOError, BadZipfile):
            print "FAILED."
            print >>sys.stderr, "WARNING, could not download or unzip iUI"
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
            dw.send_flowing_data("You must download iUI from %s, unpack it, "
                        "rename the folder to \"_iui\", and put it in the "
                        "destination directory." % iui_url)
            print
            return
        print "done."


    def download_jqm(self):
        print "Downloading jQuery Mobile... ",
        sys.stdout.flush()
        jqm_url = ("http://code.jquery.com/mobile/%(ver)s/jquery.mobile-%(ver)s.zip"
                   % {"ver": JQM_VERSION})
        jq_url = "http://code.jquery.com/jquery-%s.min.js" % JQ_VERSION
        try:
            jq_fn, _headers = urllib.urlretrieve(jqm_url)
            with ZipFile(jq_fn, "r") as jq_zip:
                jq_zip.extractall(self.path)
            os.remove(jq_fn)
            os.rename(os.path.join(self.path, "jquery.mobile-%s" % JQM_VERSION),
                      os.path.join(self.path, "_jqm"))
            urllib.urlretrieve(jq_url, os.path.join(self.path, "_jqm",
                                                    os.path.basename(jq_url)))
        except (IOError, BadZipfile):
            print "FAILED."
            print >>sys.stderr, ("WARNING, could not download or unzip "
                                "jQuery Mobile.")
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
            dw.send_flowing_data("You must download jQuery Mobile from "
                      "%(jqmurl)s, unpack it, rename the folder to \"_jqm\", "
                      "put it in the destination directory, then download "
                      "jQuery from %(jqurl)s, and put it in the same folder."
                      % { "jqmurl": jqm_url, "jqurl": jq_url } )
            print
            return
        print "done."


def parse_opts():
    """Command-line options"""
    usage = "usage: %prog -c <config file>"
    parser = optparse.OptionParser(usage)
    parser.add_option("-c", "--config", help="Configuration file")
    parser.add_option("-o", "--output", dest="output", metavar="DIR",
                      help="Output directory (will be purged !)")
    parser.add_option("--list-engines", dest="lse", action="store_true",
                      help="List available engines and exit")
    parser.add_option("-r", "--recursive", dest="recursive",
                      type="int", metavar="DEPTH", help="Download linked "
                      "pages until this depth. Be careful with that. "
                      "Default: %default)")
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="Debug mode")
    options, args = parser.parse_args()
    if (options.lse):
        engines = get_engines()
        print "\n".join(engines.keys())
        sys.exit()
    if not options.config:
        if os.path.exists(os.path.expanduser(CONFIG_PATH)):
            options.config = CONFIG_PATH
        else:
            parser.error("You must provide a configuration file (or put it "
                         "in %s)" % CONFIG_PATH)
    if not os.path.exists(os.path.expanduser(options.config)):
        parser.error("Unable to find the configuration file: %s"
                     % options.config)
    if args:
        parser.error("No arguments allowed")
    return options


def get_feeds(config):
    """Create a feed for each section of the configuration with a ``url``"""
    return [Feed(section, config.get(section, "url"))
            for section in config.sections()
            if config.has_option(section, "url")]


def choose_engine(config):
    """
    Instantiate the downloader selected by the ``engine`` configuration key.

    Exits with an error message if there is no engine with this name.
    """
    engines = get_engines()
    name = config.get("DEFAULT", "engine")
    if name not in engines:
        # sys.exit() prints the message on stderr and exits with status 1
        sys.exit("Unknown engine '%s', available engines: %s"
                 % (name, ", ".join(sorted(engines))))
    return engines[name]()


def get_config(options):
    """
    Read the configuration file and apply the command-line options to it.

    The ``output``, ``debug`` and ``recursive`` command-line options are
    stored in the ``DEFAULT`` section. Exits with status 1 if there is no
    output path, or if it is not a directory.

    :param options: the parsed command-line options (``config``, ``output``,
        ``debug`` and ``recursive`` attributes)
    :return: the configuration
    """
    # TODO: create config
    config = SafeConfigParser({"title_size": "50", "engine": "wget",
                               "recursive": "0"})
    config.read(os.path.expanduser(options.config))
    if options.output is not None:
        config.set("DEFAULT", "output", options.output)
    if not config.has_option("DEFAULT", "output"):
        print >> sys.stderr, "Config file should have an 'output' variable"
        sys.exit(1)
    if not os.path.isdir(os.path.expanduser(config.get("DEFAULT", "output"))):
        print >> sys.stderr, "The output path must be a directory"
        sys.exit(1)
    config.set("DEFAULT", "debug", str(bool(options.debug)))
    if options.recursive is not None:
        # The option is parsed as an int, but the parser only accepts strings
        config.set("DEFAULT", "recursive", str(options.recursive))
    return config


def main():
    """
    The fun starts here

    Reads the options and the configuration, parses the feeds, downloads
    their pages, then rebuilds the index and cleans the repository up. The
    configuration is stored in the ``config`` global, which the other parts
    of the script read.
    """
    global config
    options = parse_opts()
    config = get_config(options)
    downloader = choose_engine(config)

    feeds = []
    failed = []
    for feed in get_feeds(config):
        try:
            feed.parse()
        except (IOError, SyntaxError) as e:
            # IOError covers the urllib2 errors (URLError, HTTPError) and the
            # socket errors; the XML parse errors are SyntaxErrors.
            print >>sys.stderr, "Failed downloading %s: %s" % (feed.url, e)
            failed.append(feed)
        else:
            feeds.append(feed)

    repo = Repository(config.get("DEFAULT", "output"), feeds)

    for feed in feeds:
        outdir = os.path.join(repo.path, feed.name)
        for page in feed.pages:
            page.download(outdir, downloader)

    repo.make_index(downloader)
    if failed:
        # The cleanup removes the directories of the feeds it does not know:
        # a temporary failure must not wipe out the pages already mirrored.
        print >>sys.stderr, ("Not cleaning up: %d feed(s) could not be read"
                             % len(failed))
    else:
        repo.cleanup()



# Only run the mirroring when executed as a script, not when imported
if __name__ == "__main__":
    main()
