Back to my blog Back to my projects

Scripts

source: podcast-transcode.py @ 45222ed

Revision 45222ed, 16.5 KB checked in by Aurélien Bompard <aurelien@…>, 17 months ago (diff)

podcast-transcode: catch one more exception

  • Property mode set to 100755
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3u"""
4Reads an RSS/Atom feed and converts the enclosures to AVI.
5
6Dependencies:
7
8- ``flvstreamer`` for the RTMP streams
9- ``mimms`` for the MMS streams
10- ``file`` to get the sizes of the videos
11- ``mencoder`` to do the conversion
12- ``tedtalksubs.py`` to download TED talks subtitles (in this repo)
13
14:Authors:
15    Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>
16
17:License:
18    GNU GPL v3 or later
19"""
20
21import os
22import sys
23import urllib2
24import httplib
25import glob
26import subprocess
27import re
28import tempfile
29import atexit
30from urlparse import urlparse
31from optparse import OptionParser
32from stat import S_IRUSR, S_IWUSR, S_IROTH, S_IRGRP
33#from xml.etree import ElementTree as ET
34from pprint import pprint
35
36from lxml import etree as ET
37import urlgrabber
38import urlgrabber.progress
39
# Default for --exclude-tags: comma-separated list of feed categories whose
# items are dropped (empty string: nothing is excluded by default).
EXCLUDE_TAGS = ""
# Defaults for --width / --height: a video is only rescaled when it is larger
# than these dimensions.
WIDTH = 800
HEIGHT = 480
# MIME type assumed when nothing else tells us what an enclosure is.
MIME_DEFAULT = "video/x-msvideo"
# File extension given to transcoded videos.
EXTENSION = "avi"
48
49
def get_options():
    """Parse and validate the command line.

    Exits through ``parser.error()`` when a mandatory option is missing or
    inconsistent.

    :returns: an ``(options, args)`` tuple; ``args`` is always empty because
        positional arguments are rejected. ``options.exclude_tags`` is
        converted from its comma-separated form to a list of tag names.
    """
    usage = "usage: %prog -i input_feed -u URL -o output_feed [-d directory]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", "--input", dest="input",
                      help="Process this file")
    parser.add_option("-o", "--output", dest="output",
                      help="Write the RSS in this file")
    parser.add_option("-d", "--directory", dest="directory",
                      help="Write the converted videos in this directory")
    parser.add_option("-u", "--url", dest="url",
                      help="The external URL of the video folder")
    parser.add_option("-W", "--width", dest="width", type="int", default=WIDTH,
          help="Width of the converted video [default: %default]")
    parser.add_option("-H", "--height", dest="height", type="int", default=HEIGHT,
          help="Height of the converted video [default: %default]")
    parser.add_option("-m", "--max", dest="max", type="int", default=10,
          help="Only convert that many videos, drop the rest [default: %default]")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
                      default=False, help="Don't show progress bars")
    parser.add_option("-k", "--keep", dest="keep", action="store_true",
                      default=False, help="Don't remove original files")
    parser.add_option("--exclude-tags", dest="exclude_tags",
                      default=EXCLUDE_TAGS, help="Drop videos tagged with a "
                      "tag in this comma-sparated list [default: %default]")
    parser.add_option("--subtitles", dest="subtitles", metavar="LANG",
                      help="Download subtitles in this language")
    parser.add_option("--old-ffmpeg", dest="oldffmpeg", action="store_true",
                      help="FFMpeg is old (like on Debian Lenny)")
    options, args = parser.parse_args()

    if args:
        parser.error("illegal arguments: %s" % ", ".join(args))
    if not options.input:
        parser.error("I need a file to process")
    # "-" means "read the feed from standard input" (see main()), so there is
    # no file to look for in that case.
    if options.input != "-" and not os.path.exists(options.input):
        parser.error("The file to process does not exist")
    if not options.output:
        parser.error("I need a file to write to")
    if not options.url:
        parser.error("I need an external URL")
    if not options.directory and options.input != "-":
        # Default to the directory holding the input feed.
        options.directory = os.path.abspath(os.path.dirname(options.input))
    if not options.directory:
        parser.error("I need a directory for the videos")
    if not options.quiet and "TERM" not in os.environ:
        options.quiet = True # Not in a terminal, be quiet anyway
    if isinstance(options.exclude_tags, basestring):
        # Drop empty entries so that "" or "a,,b" never yields an empty tag.
        options.exclude_tags = [ t.strip() for t in
                                 options.exclude_tags.split(",")
                                 if t.strip() ]
    return options, args
99
100
class PodcastError(Exception):
    """Base class of the errors raised while handling a feed item."""


class NotAPodcastError(PodcastError):
    """The feed item cannot be handled as a podcast (no enclosure)."""


class TranscodingError(PodcastError):
    """The video encoder exited with an error."""


class DownloadingError(PodcastError):
    """The enclosure could not be downloaded."""
105
106
class Podcast(object):
    """A feed item with an enclosure that can be downloaded and transcoded.

    The class relies on two module-level globals set by ``main()``:
    ``options`` (the parsed command line) and ``flux_xml`` (the parsed feed).
    """

    # Extension -> MIME type cache, shared by every instance and filled
    # lazily from /etc/mime.types (see the ``mimetypes`` property).
    _mimetypes = {}

    def __init__(self, item):
        """Wrap the feed ``item``.

        :raises NotAPodcastError: if the item has no usable enclosure.
        """
        self.item = item
        self.enclosure = item.find("enclosure")
        if not ET.iselement(self.enclosure):
            raise NotAPodcastError()
        # Order matters: each step below uses attributes set by the
        # previous ones.
        self.url = self._get_url()
        self.path_downloaded = self._get_downloaded_path()
        self.content_type = self._get_content_type()
        self.path_encoded = self._get_encoded_path()
        self.subs = self._get_subtitles()
        self.video_info = None
        self.size = None

    def _get_url(self):
        """Return the enclosure URL, after following HTTP redirects.

        As a side effect, ``self.content_type`` is set from the HTTP
        response when the URL could be opened.

        :raises NotAPodcastError: if the enclosure has no ``url`` attribute.
        """
        url = self.enclosure.get("url")
        if not url:
            # Nothing to download: treat the item like one with no enclosure
            # instead of failing later with an AttributeError.
            raise NotAPodcastError()
        if url.startswith(options.url):
            return url
        # Resolve redirects
        try:
            remote_file = urllib2.urlopen(url)
            try:
                url = remote_file.geturl()
                self.content_type = remote_file.info().get("Content-Type")
            finally:
                remote_file.close()
        except (urllib2.HTTPError, httplib.HTTPException) as e:
            print("Failed downloading %s" % url)
            print(e)
        except urllib2.URLError:
            # urllib2 cannot open these schemes; the dedicated downloaders
            # are tried later.
            print("Probably RTMP or MMS: %s" % url)
        return url

    def _get_content_type(self):
        """Return the MIME type of the enclosure.

        Sources, by decreasing priority: the HTTP response seen by
        ``_get_url()``, the enclosure attributes, the file extension, and
        finally ``MIME_DEFAULT``.
        """
        if getattr(self, "content_type", None):
            return self.content_type # already set by _get_url()
        # "type" is the attribute update_item() writes; "content-type" is
        # still honoured first for backward compatibility.
        for attribute in ("content-type", "type"):
            declared = self.enclosure.attrib.get(attribute)
            if declared:
                return declared
        for extension, mimetype in self.mimetypes.items():
            if self.path_downloaded.endswith("."+extension):
                return mimetype
        return MIME_DEFAULT

    def _get_downloaded_path(self):
        """Return the local path the enclosure is downloaded to."""
        url_obj = urlparse(self.url)
        filename = os.path.basename(url_obj.path)
        # NOTE(review): presumably needed because urlparse may leave the
        # query string in the path for non-HTTP schemes -- confirm.
        if filename.count("?"):
            filename = filename[:filename.index("?")]
        return os.path.join(options.directory, filename)

    def _get_encoded_path(self):
        """Return the local path of the file served in the output feed."""
        if self.content_type.startswith("audio/"):
            # the file won't be transcoded
            return self.path_downloaded
        # path_downloaded already lives in options.directory: joining the
        # directory a second time would duplicate it when it is relative.
        return "%s.%s" % (os.path.splitext(self.path_downloaded)[0], EXTENSION)

    def _get_subtitles(self):
        """Download the subtitles of a TED talk.

        :returns: the path of a temporary ``.srt`` file (removed at exit),
            or ``None`` when subtitles are not requested, the feed is not a
            TED talks feed, or the download failed.
        """
        if not options.subtitles:
            return None
        feed_title = flux_xml.findtext("channel/title")
        if feed_title not in ("TEDTalks (video)", "TEDTalks (hd)"):
            return None
        guid = self.item.findtext("guid")
        if not guid or ":" not in guid:
            return None
        talkid = guid.split(":")[1]
        subfile, subfile_path = tempfile.mkstemp(prefix="podcast-transcode-sub-",
                                                 suffix=".srt")
        os.close(subfile)
        atexit.register(os.remove, subfile_path)
        subdl = subprocess.Popen(["tedtalksubs.py", "-l", options.subtitles,
                                  "-o", subfile_path, talkid],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        out, err = subdl.communicate()
        if subdl.returncode != 0:
            sys.stderr.write("Failed to download subtitles. Message:\n")
            sys.stderr.write("%s\n" % out)
            return None
        return subfile_path

    @staticmethod
    def _parse_mime_types(lines):
        """Build an extension -> MIME type dict from ``mime.types`` lines.

        Only ``video/*`` and ``audio/*`` types are kept, and every extension
        listed on a line is mapped, not just the first one.
        """
        mapping = {}
        for line in lines:
            fields = line.split("#", 1)[0].split()
            if len(fields) < 2:
                continue # blank line, comment, or type without extension
            mimetype = fields[0]
            if not mimetype.startswith(("video/", "audio/")):
                continue
            for extension in fields[1:]:
                mapping[extension] = mimetype
        return mapping

    def _get_mimetypes(self):
        """Return the extension -> MIME type mapping from /etc/mime.types.

        The mapping is cached at class level. It stays empty when the file
        cannot be opened, in which case callers fall back on
        ``MIME_DEFAULT``.
        """
        if self._mimetypes:
            return self._mimetypes
        try:
            mime_file = open("/etc/mime.types")
        except IOError:
            return self._mimetypes
        try:
            self._mimetypes.update(self._parse_mime_types(mime_file))
        finally:
            mime_file.close()
        return self._mimetypes
    mimetypes = property(_get_mimetypes)

    def is_already_transcoded(self):
        """Tell whether the enclosure already points to our own URL."""
        return self.url.startswith(options.url)

    def process(self):
        """Download and transcode the enclosure, then update the feed item."""
        if not os.path.exists(self.path_encoded):
            self.download()
            self.encode_video()
        else:
            print("Already downloaded/encoded: %s" % self.path_encoded)
        self.url = "%s/%s" % (options.url, os.path.basename(self.path_encoded))
        self.size = int(os.stat(self.path_encoded).st_size)
        self.update_item()

    def update_item(self):
        """Point the feed item to the local file (URL, size and type)."""
        self.enclosure.set("url", self.url)
        self.enclosure.set("length", str(self.size))
        self.enclosure.set("type", self.content_type)
        fb = self.item.find("{http://rssnamespace.org/feedburner/ext/1.0}origEnclosureLink")
        if ET.iselement(fb):
            fb.text = self.url
        mediacontent = self.item.find("{http://search.yahoo.com/mrss/}content")
        if ET.iselement(mediacontent):
            mediacontent.set("url", self.url)
            mediacontent.set("fileSize", str(self.size))
            mediacontent.set("type", self.content_type)

    def download(self):
        """Download the enclosure with the tool matching its URL or type.

        :raises DownloadingError: if the download failed.
        """
        if os.path.exists(self.path_downloaded):
            print("Already downloaded: %s" % self.path_downloaded)
            return
        if self.url.startswith("rtmp://"):
            self.download_rtmp()
        elif self.url.startswith("mms://"):
            self.download_mms()
        elif self.content_type and self.content_type == "video/x-ms-asf":
            self.download_asf()
        else:
            if options.quiet:
                progress = urlgrabber.progress.BaseMeter()
            else:
                progress = urlgrabber.progress.TextMeter(fo=sys.stdout)
            print("Downloading %s to %s" % (self.url, self.path_downloaded))
            try:
                urlgrabber.urlgrab(self.url, filename=self.path_downloaded,
                                   reget='simple', progress_obj=progress)
            except urlgrabber.grabber.URLGrabError as e:
                raise DownloadingError("Error downloading %s: %s"
                                       % (self.url, e))

    def download_rtmp(self):
        """Download an RTMP stream with ``flvstreamer``.

        :raises DownloadingError: if flvstreamer failed, or if the stream is
            still incomplete after ``MAX_TRIES`` retries.
        """
        MAX_TRIES = 10
        def download_rtmp_unit(url, path):
            command = ["flvstreamer", "-r", url, "-o", path]
            if options.quiet:
                command.append("-q")
            if os.path.exists(path):
                # A partial file is there: pick up where we stopped.
                command.insert(1, "--resume")
            print("Streaming %s to %s" % (url, path))
            try:
                return subprocess.call(command)
            except KeyboardInterrupt:
                return 1

        retcode = download_rtmp_unit(self.url, self.path_downloaded)
        # flvstreamer returns 2 if the download is incomplete
        current_try = 1
        while retcode == 2 and current_try <= MAX_TRIES:
            print("Trying again...")
            retcode = download_rtmp_unit(self.url, self.path_downloaded)
            current_try += 1
        if retcode == 2:
            # Only reported when the last retry was still incomplete.
            print("Too many tries, aborting.")
        if retcode != 0:
            if os.path.exists(self.path_downloaded):
                os.remove(self.path_downloaded)
            raise DownloadingError("Error code: %s" % retcode)

    def download_mms(self, url=None):
        """Download an MMS stream with ``mimms``.

        :param url: the stream to download, defaults to ``self.url``.
        :raises DownloadingError: if mimms exited with an error, whether or
            not it left a partial file behind.
        """
        if url is None:
            url = self.url
        # Alternative: mplayer -dumpstream -dumpfile <path> <url>
        command = ["mimms", url, self.path_downloaded]
        if options.quiet:
            command.append("-q")
        print("Streaming %s to %s" % (url, self.path_downloaded))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            retcode = 1
        if retcode != 0:
            if os.path.exists(self.path_downloaded):
                os.remove(self.path_downloaded)
            raise DownloadingError("Error code: %s" % retcode)

    def download_asf(self):
        """Download the MMS stream referenced by an ASF descriptor.

        :raises DownloadingError: if the descriptor cannot be fetched, does
            not reference an ``mms://`` URL, or the stream download failed.
        """
        try:
            remote_file = urllib2.urlopen(self.url)
            try:
                mms_xml = remote_file.read()
            finally:
                remote_file.close()
        except (urllib2.URLError, httplib.HTTPException) as e:
            raise DownloadingError("Error downloading %s: %s" % (self.url, e))
        mms_match = re.search('"(mms://.*)"', mms_xml)
        if mms_match is None:
            raise DownloadingError("No MMS stream found in %s" % self.url)
        # The URL comes from an XML attribute: undo the entity escaping.
        self.download_mms(mms_match.group(1).replace("&amp;", "&"))

    def encode_video(self):
        """Transcode the downloaded file to ``self.path_encoded``.

        Does nothing when both paths are the same.

        :raises TranscodingError: if the encoder failed.
        """
        if self.path_encoded == self.path_downloaded:
            print("No transcoding required")
            return
        transcoded_video = self._transcode_video()
        os.rename(transcoded_video, self.path_encoded)
        os.chmod(self.path_encoded, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) # 644

    def _transcode_video(self):
        """Run ``mencoder`` on the downloaded file.

        :returns: the path of the temporary AVI file holding the result.
        :raises TranscodingError: if mencoder exited with an error.
        """
        width, height = self.get_video_info()
        # Encode to a temporary file in the target directory so that
        # encode_video() can rename it into place.
        transcoded_video_file, transcoded_video = tempfile.mkstemp(
                prefix="podcast-transcode-", suffix=".avi", dir=options.directory)
        os.close(transcoded_video_file)
        def rm_if_exists(f):
            if os.path.exists(f):
                os.remove(f)
        if not options.keep:
            atexit.register(rm_if_exists, transcoded_video)
        command = ["mencoder", "-oac", "mp3lame",
                   "-ovc", "lavc", "-lavcopts", "vbitrate=600",
                   "-of", "avi", "-mc", "0", self.path_downloaded,
                   "-o", transcoded_video]
        if height and width:
            # Shrink only along the dimension that is too large.
            if int(height) > options.height:
                command.extend(["-vf", "scale=-3:%d" % options.height])
            elif int(width) > options.width:
                command.extend(["-vf", "scale=%d:-3" % options.width])
        if options.quiet:
            command.append("-quiet")
        if self.subs:
            command.extend(["-sub", self.subs, "-subfont-text-scale", "4"])
            if not os.path.exists(os.path.expanduser("~/.mplayer/subfont.ttf")):
                command.extend(["-fontconfig", "-font", "DejaVu Sans"])
        print(" ".join(command))
        print("Encoding %s to %s" % (self.path_downloaded, transcoded_video))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            retcode = 1
        if retcode != 0:
            if os.path.exists(transcoded_video):
                os.remove(transcoded_video)
            raise TranscodingError("Error code: %s" % retcode)
        self.content_type = "video/x-msvideo"
        return transcoded_video

    def get_video_info(self):
        """Return the ``(width, height)`` of the downloaded video.

        Both values are strings taken from the ``ffmpeg -i`` output, or
        ``None`` when they cannot be found. The result is cached.
        """
        if self.video_info is not None:
            return self.video_info
        ffmpeg_cmd = subprocess.Popen(["ffmpeg", "-i", self.path_downloaded],
                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # communicate() also waits for ffmpeg, so no zombie is left behind.
        output = ffmpeg_cmd.communicate()[0]
        info_match = re.search(r"Stream .*: Video: (\w+), \w+, (\d+)x(\d+)", output)
        if info_match:
            self.video_info = (info_match.group(2), info_match.group(3))
        else:
            self.video_info = (None, None)
        return self.video_info
367
368
def cleanup(items):
    """Remove from the video directory the files no feed item refers to.

    Files ending in ``.xml`` are always kept, and nothing is removed when
    ``--keep`` was given.

    :param items: the feed items whose files must be kept.
    """
    if options.keep:
        # Checked first: building the Podcast objects below may hit the
        # network and download subtitles, all for nothing.
        return

    feed_podcasts = set()
    for item in items:
        try:
            podcast = Podcast(item)
        except NotAPodcastError:
            continue
        feed_podcasts.add(os.path.basename(podcast.path_downloaded))
        feed_podcasts.add(os.path.basename(podcast.path_encoded))

    for filepath in glob.glob(os.path.join(options.directory, "*")):
        if filepath.endswith(".xml"):
            continue # keep the RSS feed
        if not os.path.isfile(filepath):
            continue # os.remove() would fail on a directory
        filename = os.path.basename(filepath)
        if filename not in feed_podcasts:
            print("Removing old file %s" % filename)
            os.remove(filepath)
390
391
def handle_item(item):
    """Download and transcode the enclosure of one feed item.

    Items without an enclosure and items already pointing to our own URL
    are left untouched. When subtitles were requested but none could be
    found, the item is removed from the feed. Download and transcoding
    errors are printed and otherwise ignored.
    """
    try:
        podcast = Podcast(item)
    except NotAPodcastError:
        return

    if podcast.is_already_transcoded():
        print("Already converted: %s" % podcast.url)
        return

    subtitles_missing = options.subtitles and not podcast.subs
    if subtitles_missing:
        print("No subtitles for %s, skipping." % item.findtext("guid"))
        flux_xml.find("channel").remove(item)
        return

    try:
        podcast.process()
    except (DownloadingError, TranscodingError) as error:
        print(error)
414
415
def to_skip(item):
    """Tell whether one of the item's categories is in the excluded tags."""
    excluded = options.exclude_tags
    return any(category.text in excluded
               for category in item.findall("category"))
422
def main():
    """Read the input feed, convert its enclosures and write the new feed."""
    global options, flux_xml
    options, args = get_options()
    if options.input == "-":
        options.input = sys.stdin
    flux_xml = ET.parse(options.input)
    channel = flux_xml.find("channel")

    # tag skipping: excluded items leave the feed right away
    kept = []
    for item in flux_xml.findall("channel/item"):
        if to_skip(item):
            channel.remove(item)
        else:
            kept.append(item)

    # Convert the first options.max items, drop the others from the feed.
    for position, item in enumerate(kept):
        if position >= options.max:
            channel.remove(item)
            continue
        handle_item(item)

    flux_xml.write(options.output)
    cleanup(kept[:options.max])
444
445
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
448
Note: See TracBrowser for help on using the repository browser.