Back to my blog Back to my projects

Scripts

source: rss-mirror.py @ c19ca33

Revision c19ca33, 22.1 KB checked in by Aurélien Bompard <aurelien@…>, 17 months ago (diff)

rss-mirror: add doc for the config file

  • Property mode set to 100755
Line 
1#!/usr/bin/env python
2# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent:
3u"""
4
5RSS Mirror
6----------
7
8Mirrors on the local disk the pages listed in an RSS feed, using Wget or
9HTTrack.
10
11Requires Python >= 2.6
12
13
14Configuration file
15~~~~~~~~~~~~~~~~~~
16
RSS-mirror uses a configuration file to list the RSS feeds that should be
downloaded. This file must be placed in ``~/.config/rss-mirror.conf`` and is in
INI format. Example::
20
21    [DEFAULT]
22    output = ~/pda/webpages
23
24    [owni]
25    url = http://owni.fr/feed
26
27    [zenhabits]
28    url = http://zenhabits.net/feed
29
30    [rue89-ecologie]
31    url = http://www.rue89.com/tag/ecologie/feed
32    title = Rue89 - Ecologie
33
34The ``DEFAULT`` section has an ``output`` key pointing to the output directory
35where the webpages will be downloaded.
36
37Each section (except DEFAULT) is a feed to download. It has a ``url`` key which
38is self-explanatory and an optional ``title`` key which will be used as a title
39for the feed in the summary page.
40
41
42Credits
43~~~~~~~
44
45.. :Authors:
46       Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>
47
48.. :License:
49       GNU GPL v3 or later
50
51"""
52
53import os
54import sys
55import urllib
56import urllib2
57import re
58import shutil
59import optparse
60import datetime
61import time
62import itertools
63import xml.etree.ElementTree as etree
64from urlparse import urlparse
65from subprocess import call
66from zipfile import ZipFile, BadZipfile
67from ConfigParser import SafeConfigParser
68
69
# Version of the iUI archive fetched by Repository.download_iui()
IUI_VERSION = "0.40-alpha1"
# Configuration file used when -c/--config is not given (``~`` is expanded
# by the callers before the path is used)
CONFIG_PATH = "~/.config/rss-mirror.conf"
72
73
class Downloader(object):
    """
    Common interface of the download engines.

    Concrete engines subclass this class, define a ``name`` class attribute
    and implement both methods below.

    :cvar return_codes_ok: non-zero exit codes of the external command that
        must not be treated as a failure
    :type return_codes_ok: ``list``
    """

    return_codes_ok = []

    def get_command(self, destdir, url, options=None):
        """
        Build the command that mirrors ``url`` into ``destdir``.

        ``options`` is an optional list of extra command-line switches.
        Must be implemented by the subclasses.
        """
        raise NotImplementedError()

    def get_start_path(self, basedir, page):
        """
        Return the path of the page's entry file on the disk.

        Must be implemented by the subclasses.
        """
        raise NotImplementedError()
91
92
class HttrackDownloader(Downloader):
    """
    Download using httrack. More features than wget, but it has some bugs, like
    downloading CSS stylesheets in ``@import`` constructs.
    """

    name = "httrack"
    opts = [
        "-%l", "fr", # language
        "-Y", # mirror links
        "-C0", # no cache
        "-b0", # no cookies
        "-n", # download "near" files
        "-L0", # DOS-compatible file names
        "-d", # stay on the same domain
        "-x", # replace external links by error page
        "-%u", #url hacks: various hacks to limit duplicate URLs
        "-F", "rss-mirror (allow like Gecko)", # user-agent
    ]

    def __init__(self):
        """Read the ``recursive`` setting and add the matching switch."""
        super(HttrackDownloader, self).__init__()
        # Work on a per-instance copy: appending to the class-level list
        # would add one more "-rN" switch every time an instance is created.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.append("-r%d" % recursive)

    def get_command(self, destdir, url, options=None):
        """
        Return the httrack command line as a list of arguments.

        ``options`` (extra switches) are inserted after the default ones and
        before the output directory and the URL.
        """
        command = ["httrack"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-O", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the target of the refresh redirection found in the
        ``index.html`` file of the mirrored page.

        Falls back to ``index.html`` itself when the file contains no such
        redirection. Raises ``IOError`` when ``index.html`` can't be read.
        """
        index_path = os.path.join(basedir, title, "index.html")
        with open(index_path) as indexfile:
            content = indexfile.read()
        mo = re.search('<meta HTTP-EQUIV="Refresh" CONTENT="0; URL=(.*)">',
                       content)
        if mo is None:
            # No redirection to follow: the index file (which does exist,
            # we have just read it) is the best entry point available.
            return "index.html"
        return mo.group(1)
133
134
class WgetDownloader(Downloader):
    """
    Download using wget. Simple and fast.

    I use the ``-nd`` switch to avoid creating the whole directory structure
    mirroring the website structure, because the FAT32 filesystem does not like
    very very long names.
    """

    name = "wget"
    opts = [
        "-nv", # non verbose
        "-k", # convert links
        "-p", # download needed files for the page
        "-N", # timestamping
        "--restrict-file-names=windows,ascii,lowercase",
        "-E", # adjust extension
        "-H", # allow going on a different domain
        "--timeout=15", # it's 900 by default...
        "--tries=2", # it's 20 by default...
        "-nd", # avoid having 255+ chars paths
        "--no-check-certificate", # SSL
        # User-agent: try to get the mobile version of the page
        "-U", ("Mozilla/5.0 (Linux; U; Android 2.2; en-us; rss-mirror) "
               "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile "
               "Safari/533.1"),
    ]
    return_codes_ok = [3, 4, 8]
    # 3: I/O error, usually because the filename is too long
    # 4: Network error (broken link on the page)
    # 8: Server issued error response (broken link on the page)

    def __init__(self):
        """Read the ``recursive`` setting and add the matching switches."""
        super(WgetDownloader, self).__init__()
        # Work on a per-instance copy: extending the class-level list would
        # add the recursion switches again for every new instance.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.extend(["-r", "-l%d" % recursive])

    def get_command(self, destdir, url, options=None):
        """
        Return the wget command line as a list of arguments.

        ``options`` (extra switches) are inserted after the default ones and
        before the output directory and the URL.
        """
        command = ["wget", ]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-P", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the path of the start page, relative to the page directory.

        The original URL is read from the ``url.txt`` file stored in the page
        directory, and the local file name is derived from it. Returns ``.``
        (after printing a warning) when the file can't be found. Raises
        ``IOError`` when ``url.txt`` can't be read.
        """
        with open(os.path.join(basedir, title, "url.txt")) as urlfile:
            url = urlfile.read().strip()
        if url.endswith("/"):
            url += "index.html"
        # "-nd" (no directories) decides the on-disk layout, not "-nv"
        if "-nd" in self.opts:
            local_path = self.get_start_path_nodirs(basedir, url)
        else:
            if os.path.exists(os.path.join(basedir, title, "index.html")):
                # downloaded with httrack
                httrack_dl = HttrackDownloader()
                return httrack_dl.get_start_path(basedir, title)
            local_path = self.get_start_path_dirs(basedir, url)
        candidate = os.path.join(basedir, title, local_path)
        if os.path.isfile(candidate):
            return local_path
        # wget's -E switch may have appended the extension
        if os.path.isfile(candidate + ".html"):
            return local_path + ".html"
        print >>sys.stderr, "Can't find the start page: ", local_path
        print >>sys.stderr, "Tried:", candidate, candidate + ".html"
        return "."

    def get_start_path_nodirs(self, basedir, url):
        """
        Local file name for ``url`` when no directory structure is created:
        the lowercased last path component (``index.html`` when empty),
        followed by ``@`` and the lowercased query string when there is one.
        """
        url_parsed = urlparse(url)
        local_path = os.path.basename(url_parsed[2]).lower()
        if not local_path:
            local_path = "index.html"
        if url_parsed[4]:
            local_path += "@" + url_parsed[4].lower().replace("/", "%2f")
        return local_path

    def get_start_path_dirs(self, basedir, url):
        """
        Local path for ``url`` when the directory structure is created: the
        lowercased URL without its ``http://`` or ``https://`` scheme, with
        ``?`` replaced by ``@``.
        """
        local_path = re.sub(r"^https?://", "", url.lower())
        return local_path.replace("?", "@")
217
218
def get_engines():
    """
    Return the available engines as a ``{name: class}`` dictionary, built
    from the direct subclasses of ``Downloader``.
    """
    return dict((engine.name, engine)
                for engine in Downloader.__subclasses__())
224
225
def extract_options(desc):
    """
    Extract the extra downloader switches embedded in a description.

    Switches are declared as ``{options: -a -b}`` blocks (at least one
    whitespace character is required after the colon). Several blocks may be
    present; their whitespace-separated words are returned as a single flat
    list, in order of appearance. Returns an empty list when ``desc``
    contains no such block.
    """
    opts = []
    # The group captures the text between "{options:" and the closing brace
    for inner_options in re.findall(r"\{options:\s+([^}]+)\}", desc):
        # split() without argument already drops blanks and empty words
        opts.extend(inner_options.split())
    return opts
237
238
239class Page(object):
240    """
241        - ``title`` is a shortned title derived from the page title,
242        - ``link`` is the URL,
243        - ``title_full`` is the HTML page title,
244        - ``timestamp`` is the UNIX timestamp of the page in the RSS feed, which is
245          probably the moment you bookmarked it.
246    """
247
248
249    allowed_chars = re.compile("[^a-zA-Z0-9_-]")
250    desc_cleanup = re.compile("<[^>]+>")
251
252    @classmethod
253    def parse(cls, item):
254        titlesize = config.getint("DEFAULT", "title_size",)
255        page = cls()
256        page.title_full = item.findtext("title").strip()
257        page.title = page.title_full[:titlesize].strip().lower()
258        page.title = cls.allowed_chars.sub("", page.title.replace(" ","_"))
259        page.link = item.findtext("link").strip()
260        timestamp = item.findtext("pubDate")
261        try:
262            timestamp = datetime.datetime.strptime(timestamp,
263                            "%a, %d %b %Y %H:%M:%S EDT")
264            timestamp = int(time.mktime(timestamp.timetuple()))
265        except ValueError:
266            timestamp = int(time.time())
267        page.timestamp = timestamp
268        page.description = item.findtext("description", "")
269        page.description = cls.desc_cleanup.sub("", page.description)
270        page.options = extract_options(page.description)
271        return page
272
273    def download(self, outdir, downloader):
274        """Use the downloader to mirror the page"""
275        destdir = os.path.join(outdir, self.title)
276        if os.path.exists(destdir):
277            feedname = os.path.basename(outdir)
278            print "Already downloaded:", os.path.join(feedname, self.title)
279            return
280        print "Downloading", self.title, self.link
281        try:
282            command = downloader.get_command(destdir, self.link,
283                                             options=self.options)
284            print " ".join(command)
285            if config.getboolean("DEFAULT", "debug"):
286                retcode = 0
287            else:
288                retcode = call(command)
289            if retcode < 0:
290                print
291                print >> sys.stderr, "Child was terminated by signal", -retcode
292                return
293            if retcode != 0 and retcode not in downloader.return_codes_ok:
294                print
295                print >> sys.stderr, "Something went wrong while downloading " \
296                                    + self.title + "(%s)" % self.link
297                print >> sys.stderr, "Return code: %s" % retcode
298                return
299        except OSError, e:
300            print
301            print >> sys.stderr, "Execution failed:", e
302            return
303        except KeyboardInterrupt, e:
304            print "Removing downloaded dir in 1 sec..." # to avoid partial downloads
305            time.sleep(1)
306            shutil.rmtree(destdir)
307            return
308        # Backup the URL in the url.txt file
309        link_file = open(os.path.join(destdir, "url.txt"),"w")
310        link_file.write(self.link)
311        link_file.close()
312        # Backup the HTML title in the title.txt file
313        title_file = open(os.path.join(destdir, "title.txt"),"w")
314        try:
315            title_file.write(unicode(self.title_full).encode("utf-8"))
316        except UnicodeEncodeError:
317            title_file.write(self.title)
318        title_file.close()
319        # Backup the timestamp in the timestamp.txt file
320        timestamp_file = open(os.path.join(destdir, "timestamp.txt"),"w")
321        timestamp_file.write(str(self.timestamp))
322        timestamp_file.close()
323        print
324        time.sleep(1) # Can't remember why this was necessary... FIXME
325
326
327
class Feed(object):
    """
    An RSS feed declared in the configuration file.

    ``name`` is the name of the configuration section, ``url`` the address of
    the feed. ``title`` and ``pages`` are filled in by :meth:`parse`.
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.title = None
        self.pages = []

    def parse(self):
        """
        Download and read the RSS feed, then store its title in ``title`` and
        the pages to mirror in ``pages``.
        """
        content = urllib2.urlopen(self.url)
        try:
            feed = etree.parse(content)
        finally:
            # don't keep the connection open once the feed has been read
            content.close()
        self.title = self.get_title(feed)
        self.pages = [Page.parse(item) for item in feed.findall(".//item")]

    def get_title(self, feed):
        """
        Return the title to display for this feed: the ``title`` key of the
        configuration section if there is one, else the title of the feed,
        else the section name.
        """
        if config.has_option(self.name, "title"):
            return config.get(self.name, "title")
        feed_title = feed.findtext("channel/title")
        if not feed_title:
            return self.name
        return feed_title
355
356
357class Repository(object):
358    """
359    A folder containing mirrored pages
360    """
361
362    def __init__(self, path, feeds):
363        self.path = os.path.expanduser(path)
364        self.feeds = feeds
365
366    def make_index(self, downloader):
367        """Build the HTML index of the mirrored pages"""
368        startfiles = {}
369        for feed in self.feeds:
370            startfiles[feed] = []
371            destdir = os.path.join(self.path, feed.name)
372            for page in feed.pages:
373                if not os.path.exists(os.path.join(destdir, page.title)):
374                    continue
375                try:
376                    local_path = downloader.get_start_path(destdir, page.title)
377                except IOError:
378                    print >> sys.stderr, "Can't find the url.txt file for %s" \
379                                         % page.title
380                    continue # no url.txt file, something went wrong
381                startfiles[feed].append(
382                        ( unicode(page.title_full).encode("utf-8"),
383                          page.description,
384                          "/".join([feed.name, page.title, local_path]) ) )
385        mainindex = open(os.path.join(self.path, "index.html"), "w")
386        mainindex.write("""<!DOCTYPE html>
387<html>
388<head>
389  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
390  <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
391  <title>Mirrored pages</title>
392  <link rel="stylesheet" href="_iui/iui.css" type="text/css" />
393  <link rel="stylesheet" href="_iui/t/default/default-theme.css" type="text/css"/>
394  <script type="application/x-javascript" src="_iui/iui.js"></script>
395</head>
396<body>
397
398<div class="toolbar">
399  <h1 id="pageTitle"></h1>
400    <a id="backButton" class="button" href="#"></a>
401</div>
402""")
403        if len(startfiles) > 1:
404            mainindex.write("""<ul id="index" title="Mirrored pages" selected="true">\n""")
405            for feed in sorted(startfiles, key=lambda f: f.name):
406                mainindex.write("""  <li><a href="#%(name)s">%(title)s</a></li>\n"""
407                                % {"name": feed.name, "title": feed.title} )
408            mainindex.write("</ul>\n\n")
409
410        for feed in sorted(startfiles, key=lambda f: f.name):
411            mainindex.write("""<ul id="%(name)s" title="%(title)s">\n"""
412                            % {"name": feed.name, "title": feed.title})
413            for title, description, index in startfiles[feed]:
414                mainindex.write(
415                    """  <li><a href="%s" target="_webapp">%s</a></li>\n""" %
416                    (urllib.quote(index.encode("utf-8")), title))
417            mainindex.write("</ul>\n\n")
418        mainindex.write("""
419</body>
420</html>
421""")
422
423
424    def cleanup(self):
425        """
426        Remove mirrored pages which are not in the feed anymore (probably
427        because you read them)
428        """
429        dirs_to_remove = self._get_old_feed_dirs()
430        for feed in self.feeds:
431            dirs_to_remove.extend(self._get_old_page_dirs(feed))
432        for dirname in dirs_to_remove:
433            print "Removing", dirname
434            if not config.getboolean("DEFAULT", "debug"):
435                shutil.rmtree(os.path.join(self.path, dirname))
436        self.lowercase_dirs()
437        if "_iui" not in os.listdir(self.path):
438            self.download_iui()
439
440    def _get_old_feed_dirs(self):
441        dirs = []
442        for feeddirname in os.listdir(self.path):
443            if feeddirname.startswith("."):
444                continue
445            if feeddirname == "_iui":
446                continue
447            if not os.path.isdir(os.path.join(self.path, feeddirname)):
448                continue # like "index.html" for example
449            if feeddirname not in [ f.name for f in self.feeds ]:
450                dirs.append(feeddirname)
451        return dirs
452
453    def _get_old_page_dirs(self, feed):
454        if not os.path.isdir(os.path.join(self.path, feed.name)):
455            return []
456        dirs = []
457        for dirname in os.listdir(os.path.join(self.path, feed.name)):
458            if dirname.startswith("."):
459                continue
460            if dirname not in [ p.title for p in feed.pages ]:
461                dirs.append(os.path.join(feed.name, dirname))
462        return dirs
463
464    def lowercase_dirs(self):
465        """For FAT32 transparency"""
466        for feed in self.feeds:
467            for root, dirs, files in os.walk(
468                        os.path.join(self.path, feed.name), topdown=False):
469                for name in dirs:
470                    newname = name.lower()
471                    if name != newname:
472                        source = os.path.join(self.path, feed.name, root, name)
473                        dest = os.path.join(self.path, feed.name, root, newname)
474                        if os.path.exists(dest):
475                            continue
476                        os.rename(source, dest)
477
478
479    def download_iui(self):
480        print "Downloading iUI... ",
481        sys.stdout.flush()
482        iui_url = "http://iui.googlecode.com/files/iui-%s.zip" % IUI_VERSION
483        try:
484            fn, _headers = urllib.urlretrieve(iui_url)
485            with ZipFile(fn, "r") as archive:
486                archive.extractall(self.path)
487            os.remove(fn)
488            os.rename(os.path.join(self.path, "iui-%s/web-app/iui" % IUI_VERSION),
489                      os.path.join(self.path, "_iui"))
490            shutil.rmtree(os.path.join(self.path, "iui-%s" % IUI_VERSION))
491        except (IOError, BadZipfile):
492            print "FAILED."
493            print >>sys.stderr, "WARNING, could not download or unzip iUI"
494            from formatter import DumbWriter
495            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
496            dw.send_flowing_data("You must download iUI from %s, unpack it, "
497                        "rename the folder to \"_iui\", and put it in the "
498                        "destination directory." % iui_url)
499            print
500            return
501        print "done."
502
503
504    def download_jqm(self):
505        print "Downloading jQuery Mobile... ",
506        sys.stdout.flush()
507        jqm_url = ("http://code.jquery.com/mobile/%(ver)s/jquery.mobile-%(ver)s.zip"
508                   % {"ver": JQM_VERSION})
509        jq_url = "http://code.jquery.com/jquery-%s.min.js" % JQ_VERSION
510        try:
511            jq_fn, _headers = urllib.urlretrieve(jqm_url)
512            with ZipFile(jq_fn, "r") as jq_zip:
513                jq_zip.extractall(self.path)
514            os.remove(jq_fn)
515            os.rename(os.path.join(self.path, "jquery.mobile-%s" % JQM_VERSION),
516                      os.path.join(self.path, "_jqm"))
517            urllib.urlretrieve(jq_url, os.path.join(self.path, "_jqm",
518                                                    os.path.basename(jq_url)))
519        except (IOError, BadZipfile):
520            print "FAILED."
521            print >>sys.stderr, ("WARNING, could not download or unzip "
522                                "jQuery Mobile.")
523            from formatter import DumbWriter
524            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
525            dw.send_flowing_data("You must download jQuery Mobile from "
526                      "%(jqmurl)s, unpack it, rename the folder to \"_jqm\", "
527                      "put it in the destination directory, then download "
528                      "jQuery from %(jqurl)s, and put it in the same folder."
529                      % { "jqmurl": jqm_url, "jqurl": jq_url } )
530            print
531            return
532        print "done."
533
534
535def parse_opts():
536    """Command-line options"""
537    usage = "usage: %prog -c <config file>"
538    parser = optparse.OptionParser(usage)
539    parser.add_option("-c", "--config", help="Configuration file")
540    parser.add_option("-o", "--output", dest="output", metavar="DIR",
541                      help="Output directory (will be purged !)")
542    parser.add_option("--list-engines", dest="lse", action="store_true",
543                      help="List available engines and exit")
544    parser.add_option("-r", "--recursive", dest="recursive",
545                      type="int", metavar="DEPTH", help="Download linked "
546                      "pages until this depth. Be careful with that. "
547                      "Default: %default)")
548    parser.add_option("-d", "--debug", dest="debug", action="store_true",
549                      help="Debug mode")
550    options, args = parser.parse_args()
551    if (options.lse):
552        engines = get_engines()
553        print "\n".join(engines.keys())
554        sys.exit()
555    if not options.config:
556        if os.path.exists(os.path.expanduser(CONFIG_PATH)):
557            options.config = CONFIG_PATH
558        else:
559            parser.error("You must provide a configuration file (or put it "
560                         "in %s)" % CONFIG_PATH)
561    if not os.path.exists(os.path.expanduser(options.config)):
562        parser.error("Unable to find the configuration file: %s"
563                     % options.config)
564    if args:
565        parser.error("No arguments allowed")
566    return options
567
568
def get_feeds(config):
    """
    Return one ``Feed`` per section of the configuration which has a ``url``
    key, in the order of the sections.
    """
    return [Feed(section, config.get(section, "url"))
            for section in config.sections()
            if config.has_option(section, "url")]
577
578
def choose_engine(config):
    """
    Return an instance of the download engine selected by the ``engine`` key
    of the ``DEFAULT`` section.

    Prints an error and exits with status 1 when the engine is unknown.
    """
    engines = get_engines()
    name = config.get("DEFAULT", "engine")
    if name not in engines:
        print >> sys.stderr, "Unknown engine '%s'. Available engines: %s" \
                             % (name, ", ".join(sorted(engines)))
        sys.exit(1)
    return engines[name]()
583
584
def get_config(options):
    """
    Read the configuration file and merge the command-line options in it.

    The ``output``, ``debug`` and ``recursive`` keys of the ``DEFAULT``
    section are overridden by the matching command-line options. Prints an
    error and exits with status 1 when there is no ``output`` key or when it
    is not an existing directory.

    :param options: the options returned by ``parse_opts()``
    :return: the configuration, as a ``SafeConfigParser`` instance
    """
    # TODO: create config
    config = SafeConfigParser({"title_size": "50", "engine": "wget",
                               "recursive": "0"})
    config.read(os.path.expanduser(options.config))
    if options.output is not None:
        config.set("DEFAULT", "output", options.output)
    if not config.has_option("DEFAULT", "output"):
        print >> sys.stderr, "Config file should have an 'output' variable"
        sys.exit(1)
    if not os.path.isdir(os.path.expanduser(config.get("DEFAULT", "output"))):
        print >> sys.stderr, "The output path must be a directory"
        sys.exit(1)
    config.set("DEFAULT", "debug", str(bool(options.debug)))
    if options.recursive is not None:
        # optparse gives an int, but the parser only accepts strings
        config.set("DEFAULT", "recursive", str(options.recursive))
    return config
602
603
def main():
    """
    The fun starts here: read the configuration, download the feeds, mirror
    their pages, rebuild the index and clean the output directory up.
    """
    global config
    options = parse_opts()
    config = get_config(options)
    downloader = choose_engine(config)

    feeds = []
    failed = []
    for feed in get_feeds(config):
        try:
            feed.parse()
        except urllib2.URLError as e:
            # URLError is the base class of HTTPError: this also covers
            # connection problems (DNS, refused connection...)
            print >>sys.stderr, "Failed downloading %s: %s" % (feed.url, e)
            failed.append(feed)
        else:
            feeds.append(feed)

    repo = Repository(config.get("DEFAULT", "output"), feeds)

    for feed in feeds:
        outdir = os.path.join(repo.path, feed.name)
        for page in feed.pages:
            page.download(outdir, downloader)

    repo.make_index(downloader)
    if failed:
        # cleanup() removes the directory of every feed which is not in the
        # list given to the repository: running it now would delete the
        # mirrored pages of the feeds which could not be downloaded this
        # time. It will run again on the next fully successful run.
        print >>sys.stderr, "Not cleaning up, could not download: %s" \
                            % ", ".join(f.name for f in failed)
    else:
        repo.cleanup()
628
629
630
# Only run when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.