| 1 | #!/usr/bin/env python |
|---|
| 2 | # vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent: |
|---|
| 3 | u""" |
|---|
| 4 | |
|---|
| 5 | RSS Mirror |
|---|
| 6 | ---------- |
|---|
| 7 | |
|---|
| 8 | Mirrors on the local disk the pages listed in an RSS feed, using Wget or |
|---|
| 9 | HTTrack. |
|---|
| 10 | |
|---|
| 11 | Requires Python >= 2.6 |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | Configuration file |
|---|
| 15 | ~~~~~~~~~~~~~~~~~~ |
|---|
| 16 | |
|---|
| 17 | RSS-mirror uses a configuration file to list the RSS feeds that should be |
|---|
| 18 | downloaded. This file must be placed in ``~/.config/rss-mirror.conf`` and is in |
|---|
| 19 | INI format. Example:: |
|---|
| 20 | |
|---|
| 21 | [DEFAULT] |
|---|
| 22 | output = ~/pda/webpages |
|---|
| 23 | |
|---|
| 24 | [owni] |
|---|
| 25 | url = http://owni.fr/feed |
|---|
| 26 | |
|---|
| 27 | [zenhabits] |
|---|
| 28 | url = http://zenhabits.net/feed |
|---|
| 29 | |
|---|
| 30 | [rue89-ecologie] |
|---|
| 31 | url = http://www.rue89.com/tag/ecologie/feed |
|---|
| 32 | title = Rue89 - Ecologie |
|---|
| 33 | |
|---|
| 34 | The ``DEFAULT`` section has an ``output`` key pointing to the output directory |
|---|
| 35 | where the webpages will be downloaded. |
|---|
| 36 | |
|---|
| 37 | Each section (except DEFAULT) is a feed to download. It has a ``url`` key which |
|---|
| 38 | is self-explanatory and an optional ``title`` key which will be used as a title |
|---|
| 39 | for the feed in the summary page. |
|---|
| 40 | |
|---|
| 41 | |
|---|
| 42 | Credits |
|---|
| 43 | ~~~~~~~ |
|---|
| 44 | |
|---|
| 45 | .. :Authors: |
|---|
| 46 | Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org> |
|---|
| 47 | |
|---|
| 48 | .. :License: |
|---|
| 49 | GNU GPL v3 or later |
|---|
| 50 | |
|---|
| 51 | """ |
|---|
| 52 | |
|---|
| 53 | import os |
|---|
| 54 | import sys |
|---|
| 55 | import urllib |
|---|
| 56 | import urllib2 |
|---|
| 57 | import re |
|---|
| 58 | import shutil |
|---|
| 59 | import optparse |
|---|
| 60 | import datetime |
|---|
| 61 | import time |
|---|
| 62 | import itertools |
|---|
| 63 | import xml.etree.ElementTree as etree |
|---|
| 64 | from urlparse import urlparse |
|---|
| 65 | from subprocess import call |
|---|
| 66 | from zipfile import ZipFile, BadZipfile |
|---|
| 67 | from ConfigParser import SafeConfigParser |
|---|
| 68 | |
|---|
| 69 | |
|---|
# Configuration file used when -c/--config is not given on the command line.
CONFIG_PATH = "~/.config/rss-mirror.conf"
# Release of the iUI toolkit fetched by Repository.download_iui().
IUI_VERSION = "0.40-alpha1"
|---|
| 72 | |
|---|
| 73 | |
|---|
class Downloader(object):
    """
    Base class of the download engines.

    Concrete engines derive from it and override both methods below.

    :cvar return_codes_ok: list of non-zero return codes that are actually OK
    :type return_codes_ok: ``list``
    """

    return_codes_ok = []

    def get_command(self, destdir, url, options=None):
        """Return the system command (argument list) used to mirror ``url``."""
        raise NotImplementedError()

    def get_start_path(self, basedir, page):
        """Return the path of the downloaded page on the disk."""
        raise NotImplementedError()
|---|
| 91 | |
|---|
| 92 | |
|---|
class HttrackDownloader(Downloader):
    """
    Download using httrack. More features than wget, but it has some bugs, like
    downloading CSS stylesheets in ``@import`` constructs.
    """

    name = "httrack"
    opts = [
        "-%l", "fr",  # language
        "-Y",   # mirror links
        "-C0",  # no cache
        "-b0",  # no cookies
        "-n",   # download "near" files
        "-L0",  # DOS-compatible file names
        "-d",   # stay on the same domain
        "-x",   # replace external links by error page
        "-%u",  # url hacks: various hacks to limit duplicate URLs
        "-F", "rss-mirror (allow like Gecko)",  # user-agent
    ]

    # Meta-refresh redirection looked up in the index.html of a mirrored page.
    _refresh_re = re.compile(
        '<meta HTTP-EQUIV="Refresh" CONTENT="0; URL=(.*)">')

    def __init__(self):
        """Read the recursion depth from the global ``config``."""
        super(HttrackDownloader, self).__init__()
        # Work on a per-instance copy: appending to the class-level list would
        # add one more "-rN" switch each time an instance is created.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.append("-r%d" % recursive)

    def get_command(self, destdir, url, options=None):
        """
        Return the httrack argument list mirroring ``url`` into ``destdir``.

        :param options: extra switches inserted before the output arguments
        """
        command = ["httrack"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-O", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the redirection target found in ``<basedir>/<title>/index.html``.

        :raises IOError: if the index file cannot be read or does not contain
            the expected meta-refresh redirection
        """
        index_path = os.path.join(basedir, title, "index.html")
        with open(index_path) as indexfile:
            mo = self._refresh_re.search(indexfile.read())
        if mo is None:
            # Same exception type as a missing file, so that callers which
            # skip unreadable pages also skip this one instead of crashing.
            raise IOError("No redirection found in %s" % index_path)
        return mo.group(1)
|---|
| 133 | |
|---|
| 134 | |
|---|
class WgetDownloader(Downloader):
    """
    Download using wget. Simple and fast.

    I use the ``-nd`` switch to avoid creating the whole directory structure
    mirroring the website structure, because the FAT32 filesystem does not like
    very very long names.
    """

    name = "wget"
    opts = [
        "-nv",  # non verbose
        "-k",   # convert links
        "-p",   # download needed files for the page
        "-N",   # timestamping
        "--restrict-file-names=windows,ascii,lowercase",
        "-E",   # adjust extension
        "-H",   # allow going on a different domain
        "--timeout=15",  # it's 900 by default...
        "--tries=2",     # it's 20 by default...
        "-nd",  # avoid having 255+ chars paths
        "--no-check-certificate",  # SSL
        # User-agent: try to get the mobile version of the page
        "-U", ("Mozilla/5.0 (Linux; U; Android 2.2; en-us; rss-mirror) "
               "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile "
               "Safari/533.1"),
    ]
    return_codes_ok = [3, 4, 8]
    # 3: I/O error, usually because the filename is too long
    # 4: Network error (broken link on the page)
    # 8: Server issued error response (broken link on the page)

    def __init__(self):
        """Read the recursion depth from the global ``config``."""
        super(WgetDownloader, self).__init__()
        # Work on a per-instance copy: extending the class-level list would
        # add the recursion switches again for every new instance.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.extend(["-r", "-l%d" % recursive])

    def get_command(self, destdir, url, options=None):
        """
        Return the wget argument list mirroring ``url`` into ``destdir``.

        :param options: extra switches inserted before the output arguments
        """
        command = ["wget"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-P", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the path of the start page, relative to ``<basedir>/<title>``.

        The URL is read back from the ``url.txt`` file of the page directory.
        ``"."`` is returned when no matching file is found.

        :raises IOError: if ``url.txt`` cannot be read
        """
        with open(os.path.join(basedir, title, "url.txt")) as urlfile:
            url = urlfile.read().strip()
        if url.endswith("/"):
            url += "index.html"
        # "-nd" (no directories) is the switch that flattens the layout;
        # "-nv" only controls verbosity.
        if "-nd" in self.opts:
            local_path = self.get_start_path_nodirs(basedir, url)
        else:
            if os.path.exists(os.path.join(basedir, title, "index.html")):
                # downloaded with httrack
                httrack_dl = HttrackDownloader()
                return httrack_dl.get_start_path(basedir, title)
            local_path = self.get_start_path_dirs(basedir, url)
        candidate = os.path.join(basedir, title, local_path)
        if os.path.isfile(candidate):
            return local_path
        if os.path.isfile(candidate + ".html"):
            # presumably the name produced by the "-E" switch
            return local_path + ".html"
        sys.stderr.write("Can't find the start page:  %s\n" % local_path)
        sys.stderr.write("Tried: %s %s\n" % (candidate, candidate + ".html"))
        return "."

    def get_start_path_nodirs(self, basedir, url):
        """
        Return the expected file name of ``url`` in a flat download directory.

        This is the lowercased last path component (``index.html`` when
        empty), followed by ``@`` and the lowercased query string if any.
        """
        url_parsed = urlparse(url)
        local_path = os.path.basename(url_parsed.path).lower()
        if not local_path:
            local_path = "index.html"
        if url_parsed.query:
            local_path += "@" + url_parsed.query.lower().replace("/", "%2f")
        return local_path

    def get_start_path_dirs(self, basedir, url):
        """
        Return the expected path of ``url`` when the directory structure of
        the website is reproduced: the URL without its scheme, lowercased,
        with ``?`` replaced by ``@``.
        """
        local_path = url.replace("http://", "").replace("https://", "")
        return local_path.lower().replace("?", "@")
|---|
| 217 | |
|---|
| 218 | |
|---|
def get_engines():
    """Map each engine name to its class, for every direct ``Downloader`` subclass."""
    return dict((engine.name, engine)
                for engine in Downloader.__subclasses__())
|---|
| 224 | |
|---|
| 225 | |
|---|
def extract_options(desc):
    """
    Collect the extra downloader switches embedded in a description.

    Every ``{options: ...}`` marker found in ``desc`` contributes its
    whitespace-separated words, in order of appearance. At least one
    whitespace character is required after the colon.

    :param desc: text to scan
    :return: list of option strings (empty when there is no marker)
    """
    opts = []
    # The group captures the marker content; split() drops all surrounding
    # and repeated whitespace, so no further cleanup is needed.
    for inner_options in re.findall(r"\{options:\s+([^}]+)\}", desc):
        opts.extend(inner_options.split())
    return opts
|---|
| 237 | |
|---|
| 238 | |
|---|
class Page(object):
    """
    A single item of an RSS feed.

    - ``title`` is a shortened title derived from the page title,
    - ``link`` is the URL,
    - ``title_full`` is the HTML page title,
    - ``timestamp`` is the UNIX timestamp of the page in the RSS feed, which is
      probably the moment you bookmarked it,
    - ``description`` is the item description with the markup tags removed,
    - ``options`` is the list of extra downloader switches found in the
      description.
    """

    allowed_chars = re.compile("[^a-zA-Z0-9_-]")
    desc_cleanup = re.compile("<[^>]+>")

    @classmethod
    def parse(cls, item):
        """
        Build a page from an ``<item>`` element of the feed.

        The short title is the full title truncated to the ``title_size``
        configuration value, lowercased, with spaces turned into underscores
        and any character other than letters, digits, ``_`` and ``-`` removed.
        """
        titlesize = config.getint("DEFAULT", "title_size")
        page = cls()
        page.title_full = item.findtext("title").strip()
        page.title = page.title_full[:titlesize].strip().lower()
        page.title = cls.allowed_chars.sub("", page.title.replace(" ", "_"))
        page.link = item.findtext("link").strip()
        page.timestamp = cls._parse_pubdate(item.findtext("pubDate"))
        page.description = item.findtext("description", "")
        page.description = cls.desc_cleanup.sub("", page.description)
        page.options = extract_options(page.description)
        return page

    @staticmethod
    def _parse_pubdate(pubdate):
        """Convert an RFC 822 date into a UNIX timestamp (now if unusable)."""
        from email.utils import parsedate_tz, mktime_tz
        parsed = parsedate_tz(pubdate) if pubdate else None
        if parsed is None:
            # missing or unparsable date: fall back to the current time
            return int(time.time())
        try:
            return int(mktime_tz(parsed))
        except (ValueError, OverflowError):
            return int(time.time())

    def download(self, outdir, downloader):
        """
        Use the downloader to mirror the page in ``<outdir>/<title>``.

        Nothing is done if that directory already exists. After a successful
        download the URL, the full title and the timestamp are saved in the
        ``url.txt``, ``title.txt`` and ``timestamp.txt`` files of the page
        directory. In debug mode the command is only printed.
        """
        destdir = os.path.join(outdir, self.title)
        if os.path.exists(destdir):
            feedname = os.path.basename(outdir)
            print("Already downloaded: %s"
                  % os.path.join(feedname, self.title))
            return
        print("Downloading %s %s" % (self.title, self.link))
        try:
            command = downloader.get_command(destdir, self.link,
                                             options=self.options)
            print(" ".join(command))
            if config.getboolean("DEFAULT", "debug"):
                # Dry run: nothing was downloaded, so there is no directory
                # to write the metadata files into.
                print("")
                return
            retcode = call(command)
            if retcode < 0:
                print("")
                sys.stderr.write("Child was terminated by signal %s\n"
                                 % -retcode)
                return
            if retcode != 0 and retcode not in downloader.return_codes_ok:
                print("")
                sys.stderr.write("Something went wrong while downloading "
                                 "%s(%s)\n" % (self.title, self.link))
                sys.stderr.write("Return code: %s\n" % retcode)
                return
        except OSError as e:
            print("")
            sys.stderr.write("Execution failed: %s\n" % e)
            return
        except KeyboardInterrupt:
            # to avoid partial downloads
            print("Removing downloaded dir in 1 sec...")
            time.sleep(1)
            # the directory may not have been created yet
            if os.path.isdir(destdir):
                shutil.rmtree(destdir)
            return
        if not os.path.isdir(destdir):
            # A tolerated return code does not guarantee that anything was
            # saved; without the directory the files below cannot be written.
            sys.stderr.write("Nothing was downloaded for %s (%s)\n"
                             % (self.title, self.link))
            return
        # Backup the URL in the url.txt file
        with open(os.path.join(destdir, "url.txt"), "w") as link_file:
            link_file.write(self.link)
        # Backup the HTML title in the title.txt file
        with open(os.path.join(destdir, "title.txt"), "w") as title_file:
            try:
                title_file.write(unicode(self.title_full).encode("utf-8"))
            except UnicodeEncodeError:
                title_file.write(self.title)
        # Backup the timestamp in the timestamp.txt file
        with open(os.path.join(destdir, "timestamp.txt"), "w") as ts_file:
            ts_file.write(str(self.timestamp))
        print("")
        time.sleep(1)  # Can't remember why this was necessary... FIXME
|---|
| 325 | |
|---|
| 326 | |
|---|
| 327 | |
|---|
class Feed(object):
    """
    An RSS feed: a name (its section in the configuration file), a URL and,
    once parsed, a title and the list of pages to mirror.
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.title = None
        self.pages = []

    def parse(self):
        """
        Download and read the RSS feed, then store its title in ``title``
        and one ``Page`` per ``<item>`` element in ``pages``.
        """
        content = urllib2.urlopen(self.url)
        try:
            feed = etree.parse(content)
        finally:
            # release the connection even if the document is not valid XML
            content.close()
        self.title = self.get_title(feed)
        self.pages = [Page.parse(item) for item in feed.findall(".//item")]

    def get_title(self, feed):
        """
        Return the title of the feed: the ``title`` option of its
        configuration section if set, else the ``channel/title`` text of the
        parsed document, else the feed name.
        """
        if config.has_option(self.name, "title"):
            return config.get(self.name, "title")
        feed_title = feed.findtext("channel/title")
        if not feed_title:
            return self.name
        return feed_title
|---|
| 355 | |
|---|
| 356 | |
|---|
class Repository(object):
    """
    A folder containing mirrored pages, with one sub-directory per feed and
    one sub-directory per page inside it.
    """

    def __init__(self, path, feeds):
        self.path = os.path.expanduser(path)
        self.feeds = feeds

    @staticmethod
    def _utf8(text):
        """Return ``text`` UTF-8 encoded (a plain ``str`` is left alone)."""
        if isinstance(text, str):
            return text
        return text.encode("utf-8")

    @classmethod
    def _html(cls, text):
        """Escape ``text`` for use as HTML text or inside a quoted attribute."""
        from xml.sax.saxutils import escape
        return escape(cls._utf8(text), {'"': "&quot;"})

    def make_index(self, downloader):
        """
        Build the HTML index of the mirrored pages.

        ``index.html`` is written at the root of the repository. It has one
        list per feed and, when there are several feeds, a first list linking
        to them. Pages whose directory or start page cannot be found are left
        out.
        """
        startfiles = {}
        for feed in self.feeds:
            startfiles[feed] = []
            destdir = os.path.join(self.path, feed.name)
            for page in feed.pages:
                if not os.path.exists(os.path.join(destdir, page.title)):
                    continue
                try:
                    local_path = downloader.get_start_path(destdir, page.title)
                except IOError:
                    sys.stderr.write("Can't find the url.txt file for %s\n"
                                     % page.title)
                    continue  # no url.txt file, something went wrong
                startfiles[feed].append(
                    (self._utf8(page.title_full),
                     page.description,
                     "/".join([feed.name, page.title, local_path])))
        feeds_sorted = sorted(startfiles, key=lambda f: f.name)
        # Titles come from the feeds: they are encoded and escaped before
        # being inserted in the page.
        with open(os.path.join(self.path, "index.html"), "w") as mainindex:
            mainindex.write("""<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
<title>Mirrored pages</title>
<link rel="stylesheet" href="_iui/iui.css" type="text/css" />
<link rel="stylesheet" href="_iui/t/default/default-theme.css" type="text/css"/>
<script type="application/x-javascript" src="_iui/iui.js"></script>
</head>
<body>

<div class="toolbar">
<h1 id="pageTitle"></h1>
<a id="backButton" class="button" href="#"></a>
</div>
""")
            if len(startfiles) > 1:
                mainindex.write("""<ul id="index" title="Mirrored pages" selected="true">\n""")
                for feed in feeds_sorted:
                    mainindex.write(
                        """ <li><a href="#%(name)s">%(title)s</a></li>\n"""
                        % {"name": self._html(feed.name),
                           "title": self._html(feed.title)})
                mainindex.write("</ul>\n\n")

            for feed in feeds_sorted:
                mainindex.write("""<ul id="%(name)s" title="%(title)s">\n"""
                                % {"name": self._html(feed.name),
                                   "title": self._html(feed.title)})
                for title, description, index in startfiles[feed]:
                    mainindex.write(
                        """ <li><a href="%s" target="_webapp">%s</a></li>\n"""
                        % (urllib.quote(self._utf8(index)),
                           self._html(title)))
                mainindex.write("</ul>\n\n")
            mainindex.write("""
</body>
</html>
""")

    def cleanup(self):
        """
        Remove mirrored pages which are not in the feed anymore (probably
        because you read them), and the directories of the feeds which are
        not in the feed list anymore. In debug mode the directories are only
        listed. iUI is then downloaded if the ``_iui`` directory is missing.
        """
        dirs_to_remove = self._get_old_feed_dirs()
        for feed in self.feeds:
            dirs_to_remove.extend(self._get_old_page_dirs(feed))
        for dirname in dirs_to_remove:
            print("Removing %s" % dirname)
            if not config.getboolean("DEFAULT", "debug"):
                shutil.rmtree(os.path.join(self.path, dirname))
        self.lowercase_dirs()
        if "_iui" not in os.listdir(self.path):
            self.download_iui()

    def _get_old_feed_dirs(self):
        """Return the top-level directories which do not match any feed."""
        feed_names = set(f.name for f in self.feeds)
        dirs = []
        for feeddirname in os.listdir(self.path):
            if feeddirname.startswith("."):
                continue
            if feeddirname == "_iui":
                continue
            if not os.path.isdir(os.path.join(self.path, feeddirname)):
                continue  # like "index.html" for example
            if feeddirname not in feed_names:
                dirs.append(feeddirname)
        return dirs

    def _get_old_page_dirs(self, feed):
        """
        Return the page directories of ``feed`` (relative to the repository)
        which do not match any page of the feed.
        """
        feeddir = os.path.join(self.path, feed.name)
        if not os.path.isdir(feeddir):
            return []
        page_titles = set(p.title for p in feed.pages)
        dirs = []
        for dirname in os.listdir(feeddir):
            if dirname.startswith("."):
                continue
            if not os.path.isdir(os.path.join(feeddir, dirname)):
                continue  # only directories can be removed by cleanup()
            if dirname not in page_titles:
                dirs.append(os.path.join(feed.name, dirname))
        return dirs

    def lowercase_dirs(self):
        """For FAT32 transparency"""
        for feed in self.feeds:
            for root, dirs, files in os.walk(
                    os.path.join(self.path, feed.name), topdown=False):
                for name in dirs:
                    newname = name.lower()
                    if name == newname:
                        continue
                    # "root" already starts with the walked directory: joining
                    # it again would duplicate a relative repository path.
                    source = os.path.join(root, name)
                    dest = os.path.join(root, newname)
                    if os.path.exists(dest):
                        continue
                    os.rename(source, dest)

    def download_iui(self):
        """
        Download the iUI archive and install its ``web-app/iui`` folder as
        ``_iui`` in the repository. Manual instructions are printed when the
        download or the extraction fails.
        """
        sys.stdout.write("Downloading iUI... ")
        sys.stdout.flush()
        iui_url = "http://iui.googlecode.com/files/iui-%s.zip" % IUI_VERSION
        try:
            fn, _headers = urllib.urlretrieve(iui_url)
            with ZipFile(fn, "r") as archive:
                archive.extractall(self.path)
            os.remove(fn)
            os.rename(os.path.join(self.path,
                                   "iui-%s/web-app/iui" % IUI_VERSION),
                      os.path.join(self.path, "_iui"))
            shutil.rmtree(os.path.join(self.path, "iui-%s" % IUI_VERSION))
        except (IOError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip iUI\n")
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr)  # not really necessary, but fun :)
            dw.send_flowing_data("You must download iUI from %s, unpack it, "
                "rename the folder to \"_iui\", and put it in the "
                "destination directory." % iui_url)
            print("")
            return
        print("done.")

    def download_jqm(self):
        """
        Download the jQuery Mobile archive and install it as ``_jqm`` in the
        repository, then download jQuery in the same folder. Manual
        instructions are printed when a download or the extraction fails.
        """
        # NOTE(review): JQM_VERSION and JQ_VERSION are not defined in the
        # visible part of this file -- confirm they exist before calling this.
        sys.stdout.write("Downloading jQuery Mobile... ")
        sys.stdout.flush()
        jqm_url = ("http://code.jquery.com/mobile/%(ver)s/jquery.mobile-%(ver)s.zip"
                   % {"ver": JQM_VERSION})
        jq_url = "http://code.jquery.com/jquery-%s.min.js" % JQ_VERSION
        try:
            jq_fn, _headers = urllib.urlretrieve(jqm_url)
            with ZipFile(jq_fn, "r") as jq_zip:
                jq_zip.extractall(self.path)
            os.remove(jq_fn)
            os.rename(os.path.join(self.path,
                                   "jquery.mobile-%s" % JQM_VERSION),
                      os.path.join(self.path, "_jqm"))
            urllib.urlretrieve(jq_url, os.path.join(self.path, "_jqm",
                               os.path.basename(jq_url)))
        except (IOError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip "
                             "jQuery Mobile.\n")
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr)  # not really necessary, but fun :)
            dw.send_flowing_data("You must download jQuery Mobile from "
                "%(jqmurl)s, unpack it, rename the folder to \"_jqm\", "
                "put it in the destination directory, then download "
                "jQuery from %(jqurl)s, and put it in the same folder."
                % {"jqmurl": jqm_url, "jqurl": jq_url})
            print("")
            return
        print("done.")
|---|
| 533 | |
|---|
| 534 | |
|---|
def parse_opts():
    """
    Read the command line and return the parsed options.

    ``--list-engines`` prints the engine names and exits. When no
    configuration file is given, ``CONFIG_PATH`` is used if it exists. A
    missing configuration file or any positional argument is reported as a
    usage error.
    """
    parser = optparse.OptionParser("usage: %prog -c <config file>")
    parser.add_option("-c", "--config", help="Configuration file")
    parser.add_option(
        "-o", "--output", dest="output", metavar="DIR",
        help="Output directory (will be purged !)")
    parser.add_option(
        "--list-engines", dest="lse", action="store_true",
        help="List available engines and exit")
    parser.add_option(
        "-r", "--recursive", dest="recursive", type="int", metavar="DEPTH",
        help=("Download linked pages until this depth. "
              "Be careful with that. Default: %default)"))
    parser.add_option(
        "-d", "--debug", dest="debug", action="store_true",
        help="Debug mode")
    options, leftovers = parser.parse_args()

    if options.lse:
        print("\n".join(get_engines().keys()))
        sys.exit()

    if not options.config:
        # parser.error() exits, so the default is only set when it exists
        if not os.path.exists(os.path.expanduser(CONFIG_PATH)):
            parser.error("You must provide a configuration file (or put it "
                         "in %s)" % CONFIG_PATH)
        options.config = CONFIG_PATH
    if not os.path.exists(os.path.expanduser(options.config)):
        parser.error("Unable to find the configuration file: %s"
                     % options.config)
    if leftovers:
        parser.error("No arguments allowed")
    return options
|---|
| 567 | |
|---|
| 568 | |
|---|
def get_feeds(config):
    """Return one ``Feed`` per configuration section having a ``url`` option."""
    return [Feed(section, config.get(section, "url"))
            for section in config.sections()
            if config.has_option(section, "url")]
|---|
| 577 | |
|---|
| 578 | |
|---|
def choose_engine(config):
    """
    Instantiate the downloader named by the ``engine`` configuration value.

    An unknown engine name is reported on the standard error and ends the
    program with exit status 1.
    """
    engines = get_engines()
    name = config.get("DEFAULT", "engine")
    if name not in engines:
        sys.stderr.write("Unknown engine '%s', available engines: %s\n"
                         % (name, ", ".join(sorted(engines))))
        sys.exit(1)
    return engines[name]()
|---|
| 583 | |
|---|
| 584 | |
|---|
def get_config(options):
    """
    Load the configuration file and apply the command-line overrides.

    Defaults are ``title_size`` = 50, ``engine`` = wget and ``recursive`` = 0.
    The ``output`` and ``recursive`` command-line options replace the values
    of the file, and ``debug`` is always set from the command line. The
    program exits with status 1 when there is no output directory or when it
    is not a directory.

    :param options: parsed command-line options (``config``, ``output``,
        ``debug`` and ``recursive`` attributes)
    :return: the configuration parser
    """
    # TODO: create config
    config = SafeConfigParser({"title_size": "50", "engine": "wget",
                               "recursive": "0"})
    config.read(os.path.expanduser(options.config))
    if options.output is not None:
        config.set("DEFAULT", "output", options.output)
    if not config.has_option("DEFAULT", "output"):
        sys.stderr.write("Config file should have an 'output' variable\n")
        sys.exit(1)
    if not os.path.isdir(os.path.expanduser(config.get("DEFAULT", "output"))):
        sys.stderr.write("The output path must be a directory\n")
        sys.exit(1)
    config.set("DEFAULT", "debug", str(bool(options.debug)))
    if options.recursive is not None:
        # optparse yields an int, but option values must be strings
        config.set("DEFAULT", "recursive", str(options.recursive))
    return config
|---|
| 602 | |
|---|
| 603 | |
|---|
def main():
    """
    The fun starts here: read the options and the configuration, fetch the
    feeds, mirror their pages, then rebuild the index and clean up the
    output directory.
    """
    # the configuration is read as a module-level global by the other classes
    global config
    options = parse_opts()
    config = get_config(options)
    downloader = choose_engine(config)

    feeds = []
    for feed in get_feeds(config):
        try:
            feed.parse()
        except urllib2.URLError as e:
            # URLError also covers HTTPError, its subclass, so that an
            # unreachable server does not abort the whole run.
            sys.stderr.write("Failed downloading %s: %s\n" % (feed.url, e))
            # NOTE(review): a feed left out here is absent from the
            # repository feed list, so repo.cleanup() below removes its
            # directory -- confirm this is wanted on temporary failures.
            continue
        feeds.append(feed)

    repo = Repository(config.get("DEFAULT", "output"), feeds)

    for feed in feeds:
        outdir = os.path.join(repo.path, feed.name)
        for page in feed.pages:
            page.download(outdir, downloader)

    repo.make_index(downloader)
    repo.cleanup()


if __name__ == "__main__":
    main()
|---|