Original 2010-09-03 version
David Blume

David Blume committed on 2018-01-20 20:10:33
Showing 4 changed files with 4291 additions and 0 deletions.

... ...
@@ -0,0 +1,19 @@
+Copyright (c) 2018, David Blume
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
... ...
@@ -0,0 +1,30 @@
+[![License](https://img.shields.io/badge/license-MIT_license-blue.svg)](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt)
+![python2.x](https://img.shields.io/badge/python-2.x-yellow.svg)
+# TechCrunch Feed Filter
+
+This is a Python script, run as a cronjob, that reads the TechCrunch article feed
+and decides which articles to include in its own feed.
+
+Here's a [blog post about it](http://david.dlma.com/blog/my-techcrunch-feed-filter).
+
+# History
+
+This was originally archived in a Subversion repo. I'd forgotten about the
+version control and had gotten into the habit of just modifying the production
+site.
+
+* 2010-09-03: Original version.
+* 2010-09-03: Save off the disqus identifier for use later.
+* 2011-02-04: Algorithm changes (tags and author checked), new chart drawing, spaces used instead of tabs.
+* 2011-02-04: Update to the chart drawing algorithm.
+* 2013-08-04: Miscellaneous changes to techcrunch.py.
+* 2015-11-23: Resync svn with production site.
+* 2015-11-27: Remove obsolete disqus and retweet code, and refactor style to be more PEP-8ish.
+
+# Is it any good?
+
+[Yes](https://news.ycombinator.com/item?id=3067434).
+
+# License
+
+This software uses the [MIT license](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt).
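
To make the filtering idea concrete, here is a minimal sketch (not part of this commit) of how a cron-driven script can read the TechCrunch feed with the bundled feedparser module. The feed URL, the keyword list, and the keep/drop rule are illustrative assumptions; the real techcrunch.py uses its own algorithm (per the History notes, it checks tags and the author).

```python
# Illustrative sketch only -- not from this repository. The feed URL and
# keyword list are assumptions; techcrunch.py applies its own scoring.
import feedparser

FEED_URL = 'http://feeds.feedburner.com/TechCrunch/'  # assumed feed location
KEYWORDS = ('google', 'apple', 'funding')             # assumed watch list


def interesting(entry):
    """Keep an entry whose title or tags mention a watched keyword."""
    title = entry.get('title', '').lower()
    tags = ' '.join(t.get('term', '') for t in entry.get('tags', [])).lower()
    return any(k in title or k in tags for k in KEYWORDS)


def filter_feed(url=FEED_URL):
    """Parse the source feed and return only the entries worth keeping."""
    parsed = feedparser.parse(url)
    return [entry for entry in parsed.entries if interesting(entry)]


if __name__ == '__main__':
    for entry in filter_feed():
        print('%s\n    %s' % (entry.get('title', ''), entry.get('link', '')))
```

Writing the kept entries back out as a feed of their own (which is what the production script does) is omitted from this sketch.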
... ...
@@ -0,0 +1,3612 @@
1
+#!/usr/bin/env python
2
+"""Universal feed parser
3
+
4
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5
+
6
+Visit http://feedparser.org/ for the latest version
7
+Visit http://feedparser.org/docs/ for the latest documentation
8
+
9
+Required: Python 2.1 or later
10
+Recommended: Python 2.3 or later
11
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
12
+"""
13
+
14
+__version__ = "4.2-pre-" + "$Revision: 291 $"[11:14] + "-svn"
15
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
16
+
17
+Redistribution and use in source and binary forms, with or without modification,
18
+are permitted provided that the following conditions are met:
19
+
20
+* Redistributions of source code must retain the above copyright notice,
21
+  this list of conditions and the following disclaimer.
22
+* Redistributions in binary form must reproduce the above copyright notice,
23
+  this list of conditions and the following disclaimer in the documentation
24
+  and/or other materials provided with the distribution.
25
+
26
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
27
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36
+POSSIBILITY OF SUCH DAMAGE."""
37
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
38
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
39
+                    "John Beimler <http://john.beimler.org/>",
40
+                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
41
+                    "Aaron Swartz <http://aaronsw.com/>",
42
+                    "Kevin Marks <http://epeus.blogspot.com/>",
43
+                    "Sam Ruby <http://intertwingly.net/>"]
44
+_debug = 0
45
+
46
+# HTTP "User-Agent" header to send to servers when downloading feeds.
47
+# If you are embedding feedparser in a larger application, you should
48
+# change this to your application name and URL.
49
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
50
+
51
+# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
52
+# want to send an Accept header, set this to None.
53
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
54
+
55
+# List of preferred XML parsers, by SAX driver name.  These will be tried first,
56
+# but if they're not installed, Python will keep searching through its own list
57
+# of pre-installed parsers until it finds one that supports everything we need.
58
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
59
+
60
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set
61
+# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
62
+# or utidylib <http://utidylib.berlios.de/>.
63
+TIDY_MARKUP = 0
64
+
65
+# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
66
+# if TIDY_MARKUP = 1
67
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
68
+
69
+# If you want feedparser to automatically resolve all relative URIs, set this
70
+# to 1.
71
+RESOLVE_RELATIVE_URIS = 1
72
+
73
+# If you want feedparser to automatically sanitize all potentially unsafe
74
+# HTML content, set this to 1.
75
+SANITIZE_HTML = 1
76
+
77
+# ---------- required modules (should come with any Python distribution) ----------
78
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
79
+try:
80
+    from cStringIO import StringIO as _StringIO
81
+except:
82
+    from StringIO import StringIO as _StringIO
83
+
84
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
85
+
86
+# gzip is included with most Python distributions, but may not be available if you compiled your own
87
+try:
88
+    import gzip
89
+except:
90
+    gzip = None
91
+try:
92
+    import zlib
93
+except:
94
+    zlib = None
95
+
96
+# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
97
+# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
98
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
99
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
100
+try:
101
+    import xml.sax
102
+    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
103
+    from xml.sax.saxutils import escape as _xmlescape
104
+    _XML_AVAILABLE = 1
105
+except:
106
+    _XML_AVAILABLE = 0
107
+    def _xmlescape(data,entities={}):
108
+        data = data.replace('&', '&amp;')
109
+        data = data.replace('>', '&gt;')
110
+        data = data.replace('<', '&lt;')
111
+        for char, entity in entities:
112
+            data = data.replace(char, entity)
113
+        return data
114
+
115
+# base64 support for Atom feeds that contain embedded binary data
116
+try:
117
+    import base64, binascii
118
+except:
119
+    base64 = binascii = None
120
+
121
+# cjkcodecs and iconv_codec provide support for more character encodings.
122
+# Both are available from http://cjkpython.i18n.org/
123
+try:
124
+    import cjkcodecs.aliases
125
+except:
126
+    pass
127
+try:
128
+    import iconv_codec
129
+except:
130
+    pass
131
+
132
+# chardet library auto-detects character encodings
133
+# Download from http://chardet.feedparser.org/
134
+try:
135
+    import chardet
136
+    if _debug:
137
+        import chardet.constants
138
+        chardet.constants._debug = 1
139
+except:
140
+    chardet = None
141
+
142
+# reversable htmlentitydefs mappings for Python 2.2
143
+try:
144
+  from htmlentitydefs import name2codepoint, codepoint2name
145
+except:
146
+  import htmlentitydefs
147
+  name2codepoint={}
148
+  codepoint2name={}
149
+  for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
150
+    if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
151
+    name2codepoint[name]=ord(codepoint)
152
+    codepoint2name[ord(codepoint)]=name
153
+
154
+# BeautifulSoup parser used for parsing microformats from embedded HTML content
155
+# http://www.crummy.com/software/BeautifulSoup/
156
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
157
+# older 2.x series.  If it doesn't, and you can figure out why, I'll accept a
158
+# patch and modify the compatibility statement accordingly.
159
+try:
160
+    import BeautifulSoup
161
+except:
162
+    BeautifulSoup = None
163
+
164
+# ---------- don't touch these ----------
165
+class ThingsNobodyCaresAboutButMe(Exception): pass
166
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
167
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
168
+class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
169
+class UndeclaredNamespace(Exception): pass
170
+
171
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
172
+sgmllib.special = re.compile('<!')
173
+sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
174
+
175
+if sgmllib.endbracket.search(' <').start(0):
176
+    class EndBracketMatch:
177
+        endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
178
+        def search(self,string,index=0):
179
+            self.match = self.endbracket.match(string,index)
180
+            if self.match: return self
181
+        def start(self,n):
182
+            return self.match.end(n)
183
+    sgmllib.endbracket = EndBracketMatch()
184
+
185
+SUPPORTED_VERSIONS = {'': 'unknown',
186
+                      'rss090': 'RSS 0.90',
187
+                      'rss091n': 'RSS 0.91 (Netscape)',
188
+                      'rss091u': 'RSS 0.91 (Userland)',
189
+                      'rss092': 'RSS 0.92',
190
+                      'rss093': 'RSS 0.93',
191
+                      'rss094': 'RSS 0.94',
192
+                      'rss20': 'RSS 2.0',
193
+                      'rss10': 'RSS 1.0',
194
+                      'rss': 'RSS (unknown version)',
195
+                      'atom01': 'Atom 0.1',
196
+                      'atom02': 'Atom 0.2',
197
+                      'atom03': 'Atom 0.3',
198
+                      'atom10': 'Atom 1.0',
199
+                      'atom': 'Atom (unknown version)',
200
+                      'cdf': 'CDF',
201
+                      'hotrss': 'Hot RSS'
202
+                      }
203
+
204
+try:
205
+    UserDict = dict
206
+except NameError:
207
+    # Python 2.1 does not have dict
208
+    from UserDict import UserDict
209
+    def dict(aList):
210
+        rc = {}
211
+        for k, v in aList:
212
+            rc[k] = v
213
+        return rc
214
+
215
+class FeedParserDict(UserDict):
216
+    keymap = {'channel': 'feed',
217
+              'items': 'entries',
218
+              'guid': 'id',
219
+              'date': 'updated',
220
+              'date_parsed': 'updated_parsed',
221
+              'description': ['subtitle', 'summary'],
222
+              'url': ['href'],
223
+              'modified': 'updated',
224
+              'modified_parsed': 'updated_parsed',
225
+              'issued': 'published',
226
+              'issued_parsed': 'published_parsed',
227
+              'copyright': 'rights',
228
+              'copyright_detail': 'rights_detail',
229
+              'tagline': 'subtitle',
230
+              'tagline_detail': 'subtitle_detail'}
231
+    def __getitem__(self, key):
232
+        if key == 'category':
233
+            return UserDict.__getitem__(self, 'tags')[0]['term']
234
+        if key == 'enclosures':
235
+            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
236
+            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
237
+        if key == 'license':
238
+            for link in UserDict.__getitem__(self, 'links'):
239
+                if link['rel']=='license' and link.has_key('href'):
240
+                    return link['href']
241
+        if key == 'categories':
242
+            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
243
+        realkey = self.keymap.get(key, key)
244
+        if type(realkey) == types.ListType:
245
+            for k in realkey:
246
+                if UserDict.has_key(self, k):
247
+                    return UserDict.__getitem__(self, k)
248
+        if UserDict.has_key(self, key):
249
+            return UserDict.__getitem__(self, key)
250
+        return UserDict.__getitem__(self, realkey)
251
+
252
+    def __setitem__(self, key, value):
253
+        for k in self.keymap.keys():
254
+            if key == k:
255
+                key = self.keymap[k]
256
+                if type(key) == types.ListType:
257
+                    key = key[0]
258
+        return UserDict.__setitem__(self, key, value)
259
+
260
+    def get(self, key, default=None):
261
+        if self.has_key(key):
262
+            return self[key]
263
+        else:
264
+            return default
265
+
266
+    def setdefault(self, key, value):
267
+        if not self.has_key(key):
268
+            self[key] = value
269
+        return self[key]
270
+
271
+    def has_key(self, key):
272
+        try:
273
+            return hasattr(self, key) or UserDict.has_key(self, key)
274
+        except AttributeError:
275
+            return False
276
+
277
+    def __getattr__(self, key):
278
+        try:
279
+            return self.__dict__[key]
280
+        except KeyError:
281
+            pass
282
+        try:
283
+            assert not key.startswith('_')
284
+            return self.__getitem__(key)
285
+        except:
286
+            raise AttributeError, "object has no attribute '%s'" % key
287
+
288
+    def __setattr__(self, key, value):
289
+        if key.startswith('_') or key == 'data':
290
+            self.__dict__[key] = value
291
+        else:
292
+            return self.__setitem__(key, value)
293
+
294
+    def __contains__(self, key):
295
+        return self.has_key(key)
296
+
297
+def zopeCompatibilityHack():
298
+    global FeedParserDict
299
+    del FeedParserDict
300
+    def FeedParserDict(aDict=None):
301
+        rc = {}
302
+        if aDict:
303
+            rc.update(aDict)
304
+        return rc
305
+
306
+_ebcdic_to_ascii_map = None
307
+def _ebcdic_to_ascii(s):
308
+    global _ebcdic_to_ascii_map
309
+    if not _ebcdic_to_ascii_map:
310
+        emap = (
311
+            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
312
+            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
313
+            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
314
+            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
315
+            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
316
+            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
317
+            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
318
+            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
319
+            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
320
+            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
321
+            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
322
+            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
323
+            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
324
+            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
325
+            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
326
+            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
327
+            )
328
+        import string
329
+        _ebcdic_to_ascii_map = string.maketrans( \
330
+            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
331
+    return s.translate(_ebcdic_to_ascii_map)
332
+
333
+_cp1252 = {
334
+  unichr(128): unichr(8364), # euro sign
335
+  unichr(130): unichr(8218), # single low-9 quotation mark
336
+  unichr(131): unichr( 402), # latin small letter f with hook
337
+  unichr(132): unichr(8222), # double low-9 quotation mark
338
+  unichr(133): unichr(8230), # horizontal ellipsis
339
+  unichr(134): unichr(8224), # dagger
340
+  unichr(135): unichr(8225), # double dagger
341
+  unichr(136): unichr( 710), # modifier letter circumflex accent
342
+  unichr(137): unichr(8240), # per mille sign
343
+  unichr(138): unichr( 352), # latin capital letter s with caron
344
+  unichr(139): unichr(8249), # single left-pointing angle quotation mark
345
+  unichr(140): unichr( 338), # latin capital ligature oe
346
+  unichr(142): unichr( 381), # latin capital letter z with caron
347
+  unichr(145): unichr(8216), # left single quotation mark
348
+  unichr(146): unichr(8217), # right single quotation mark
349
+  unichr(147): unichr(8220), # left double quotation mark
350
+  unichr(148): unichr(8221), # right double quotation mark
351
+  unichr(149): unichr(8226), # bullet
352
+  unichr(150): unichr(8211), # en dash
353
+  unichr(151): unichr(8212), # em dash
354
+  unichr(152): unichr( 732), # small tilde
355
+  unichr(153): unichr(8482), # trade mark sign
356
+  unichr(154): unichr( 353), # latin small letter s with caron
357
+  unichr(155): unichr(8250), # single right-pointing angle quotation mark
358
+  unichr(156): unichr( 339), # latin small ligature oe
359
+  unichr(158): unichr( 382), # latin small letter z with caron
360
+  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
361
+
362
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
363
+def _urljoin(base, uri):
364
+    uri = _urifixer.sub(r'\1\3', uri)
365
+    try:
366
+        return urlparse.urljoin(base, uri)
367
+    except:
368
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
369
+        return urlparse.urljoin(base, uri)
370
+
371
+class _FeedParserMixin:
372
+    namespaces = {'': '',
373
+                  'http://backend.userland.com/rss': '',
374
+                  'http://blogs.law.harvard.edu/tech/rss': '',
375
+                  'http://purl.org/rss/1.0/': '',
376
+                  'http://my.netscape.com/rdf/simple/0.9/': '',
377
+                  'http://example.com/newformat#': '',
378
+                  'http://example.com/necho': '',
379
+                  'http://purl.org/echo/': '',
380
+                  'uri/of/echo/namespace#': '',
381
+                  'http://purl.org/pie/': '',
382
+                  'http://purl.org/atom/ns#': '',
383
+                  'http://www.w3.org/2005/Atom': '',
384
+                  'http://purl.org/rss/1.0/modules/rss091#': '',
385
+
386
+                  'http://webns.net/mvcb/':                               'admin',
387
+                  'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
388
+                  'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
389
+                  'http://media.tangent.org/rss/1.0/':                    'audio',
390
+                  'http://backend.userland.com/blogChannelModule':        'blogChannel',
391
+                  'http://web.resource.org/cc/':                          'cc',
392
+                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
393
+                  'http://purl.org/rss/1.0/modules/company':              'co',
394
+                  'http://purl.org/rss/1.0/modules/content/':             'content',
395
+                  'http://my.theinfo.org/changed/1.0/rss/':               'cp',
396
+                  'http://purl.org/dc/elements/1.1/':                     'dc',
397
+                  'http://purl.org/dc/terms/':                            'dcterms',
398
+                  'http://purl.org/rss/1.0/modules/email/':               'email',
399
+                  'http://purl.org/rss/1.0/modules/event/':               'ev',
400
+                  'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
401
+                  'http://freshmeat.net/rss/fm/':                         'fm',
402
+                  'http://xmlns.com/foaf/0.1/':                           'foaf',
403
+                  'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
404
+                  'http://postneo.com/icbm/':                             'icbm',
405
+                  'http://purl.org/rss/1.0/modules/image/':               'image',
406
+                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
407
+                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
408
+                  'http://purl.org/rss/1.0/modules/link/':                'l',
409
+                  'http://search.yahoo.com/mrss':                         'media',
410
+                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
411
+                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
412
+                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
413
+                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
414
+                  'http://purl.org/rss/1.0/modules/reference/':           'ref',
415
+                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
416
+                  'http://purl.org/rss/1.0/modules/search/':              'search',
417
+                  'http://purl.org/rss/1.0/modules/slash/':               'slash',
418
+                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
419
+                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
420
+                  'http://hacks.benhammersley.com/rss/streaming/':        'str',
421
+                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',
422
+                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',
423
+                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf',
424
+                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
425
+                  'http://purl.org/rss/1.0/modules/threading/':           'thr',
426
+                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',
427
+                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
428
+                  'http://wellformedweb.org/commentAPI/':                 'wfw',
429
+                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
430
+                  'http://www.w3.org/1999/xhtml':                         'xhtml',
431
+                  'http://www.w3.org/1999/xlink':                         'xlink',
432
+                  'http://www.w3.org/XML/1998/namespace':                 'xml'
433
+}
434
+    _matchnamespaces = {}
435
+
436
+    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
437
+    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
438
+    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
439
+    html_types = ['text/html', 'application/xhtml+xml']
440
+
441
+    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
442
+        if _debug: sys.stderr.write('initializing FeedParser\n')
443
+        if not self._matchnamespaces:
444
+            for k, v in self.namespaces.items():
445
+                self._matchnamespaces[k.lower()] = v
446
+        self.feeddata = FeedParserDict() # feed-level data
447
+        self.encoding = encoding # character encoding
448
+        self.entries = [] # list of entry-level data
449
+        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
450
+        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
451
+
452
+        # the following are used internally to track state;
453
+        # this is really out of control and should be refactored
454
+        self.infeed = 0
455
+        self.inentry = 0
456
+        self.incontent = 0
457
+        self.intextinput = 0
458
+        self.inimage = 0
459
+        self.inauthor = 0
460
+        self.incontributor = 0
461
+        self.inpublisher = 0
462
+        self.insource = 0
463
+        self.sourcedata = FeedParserDict()
464
+        self.contentparams = FeedParserDict()
465
+        self._summaryKey = None
466
+        self.namespacemap = {}
467
+        self.elementstack = []
468
+        self.basestack = []
469
+        self.langstack = []
470
+        self.baseuri = baseuri or ''
471
+        self.lang = baselang or None
472
+        self.svgOK = 0
473
+        self.hasTitle = 0
474
+        if baselang:
475
+            self.feeddata['language'] = baselang.replace('_','-')
476
+
477
+    def unknown_starttag(self, tag, attrs):
478
+        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
479
+        # normalize attrs
480
+        attrs = [(k.lower(), v) for k, v in attrs]
481
+        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
482
+
483
+        # track xml:base and xml:lang
484
+        attrsD = dict(attrs)
485
+        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
486
+        if type(baseuri) != type(u''):
487
+            try:
488
+                baseuri = unicode(baseuri, self.encoding)
489
+            except:
490
+                baseuri = unicode(baseuri, 'iso-8859-1')
491
+        self.baseuri = _urljoin(self.baseuri, baseuri)
492
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
493
+        if lang == '':
494
+            # xml:lang could be explicitly set to '', we need to capture that
495
+            lang = None
496
+        elif lang is None:
497
+            # if no xml:lang is specified, use parent lang
498
+            lang = self.lang
499
+        if lang:
500
+            if tag in ('feed', 'rss', 'rdf:RDF'):
501
+                self.feeddata['language'] = lang.replace('_','-')
502
+        self.lang = lang
503
+        self.basestack.append(self.baseuri)
504
+        self.langstack.append(lang)
505
+
506
+        # track namespaces
507
+        for prefix, uri in attrs:
508
+            if prefix.startswith('xmlns:'):
509
+                self.trackNamespace(prefix[6:], uri)
510
+            elif prefix == 'xmlns':
511
+                self.trackNamespace(None, uri)
512
+
513
+        # track inline content
514
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
515
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
516
+            # element declared itself as escaped markup, but it isn't really
517
+            self.contentparams['type'] = 'application/xhtml+xml'
518
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
519
+            if tag.find(':') <> -1:
520
+                prefix, tag = tag.split(':', 1)
521
+                namespace = self.namespacesInUse.get(prefix, '')
522
+                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
523
+                    attrs.append(('xmlns',namespace))
524
+                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
525
+                    attrs.append(('xmlns',namespace))
526
+            if tag == 'svg': self.svgOK += 1
527
+            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
528
+
529
+        # match namespaces
530
+        if tag.find(':') <> -1:
531
+            prefix, suffix = tag.split(':', 1)
532
+        else:
533
+            prefix, suffix = '', tag
534
+        prefix = self.namespacemap.get(prefix, prefix)
535
+        if prefix:
536
+            prefix = prefix + '_'
537
+
538
+        # special hack for better tracking of empty textinput/image elements in illformed feeds
539
+        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
540
+            self.intextinput = 0
541
+        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
542
+            self.inimage = 0
543
+
544
+        # call special handler (if defined) or default handler
545
+        methodname = '_start_' + prefix + suffix
546
+        try:
547
+            method = getattr(self, methodname)
548
+            return method(attrsD)
549
+        except AttributeError:
550
+            return self.push(prefix + suffix, 1)
551
+
552
+    def unknown_endtag(self, tag):
553
+        if _debug: sys.stderr.write('end %s\n' % tag)
554
+        # match namespaces
555
+        if tag.find(':') <> -1:
556
+            prefix, suffix = tag.split(':', 1)
557
+        else:
558
+            prefix, suffix = '', tag
559
+        prefix = self.namespacemap.get(prefix, prefix)
560
+        if prefix:
561
+            prefix = prefix + '_'
562
+        if suffix == 'svg' and self.svgOK: self.svgOK -= 1
563
+
564
+        # call special handler (if defined) or default handler
565
+        methodname = '_end_' + prefix + suffix
566
+        try:
567
+            if self.svgOK: raise AttributeError()
568
+            method = getattr(self, methodname)
569
+            method()
570
+        except AttributeError:
571
+            self.pop(prefix + suffix)
572
+
573
+        # track inline content
574
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
575
+            # element declared itself as escaped markup, but it isn't really
576
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
577
+            self.contentparams['type'] = 'application/xhtml+xml'
578
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
579
+            tag = tag.split(':')[-1]
580
+            self.handle_data('</%s>' % tag, escape=0)
581
+
582
+        # track xml:base and xml:lang going out of scope
583
+        if self.basestack:
584
+            self.basestack.pop()
585
+            if self.basestack and self.basestack[-1]:
586
+                self.baseuri = self.basestack[-1]
587
+        if self.langstack:
588
+            self.langstack.pop()
589
+            if self.langstack: # and (self.langstack[-1] is not None):
590
+                self.lang = self.langstack[-1]
591
+
592
+    def handle_charref(self, ref):
593
+        # called for each character reference, e.g. for '&#160;', ref will be '160'
594
+        if not self.elementstack: return
595
+        ref = ref.lower()
596
+        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
597
+            text = '&#%s;' % ref
598
+        else:
599
+            if ref[0] == 'x':
600
+                c = int(ref[1:], 16)
601
+            else:
602
+                c = int(ref)
603
+            text = unichr(c).encode('utf-8')
604
+        self.elementstack[-1][2].append(text)
605
+
606
+    def handle_entityref(self, ref):
607
+        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
608
+        if not self.elementstack: return
609
+        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
610
+        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
611
+            text = '&%s;' % ref
612
+        elif ref in self.entities.keys():
613
+            text = self.entities[ref]
614
+            if text.startswith('&#') and text.endswith(';'):
615
+                return self.handle_entityref(text)
616
+        else:
617
+            try: name2codepoint[ref]
618
+            except KeyError: text = '&%s;' % ref
619
+            else: text = unichr(name2codepoint[ref]).encode('utf-8')
620
+        self.elementstack[-1][2].append(text)
621
+
622
+    def handle_data(self, text, escape=1):
623
+        # called for each block of plain text, i.e. outside of any tag and
624
+        # not containing any character or entity references
625
+        if not self.elementstack: return
626
+        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
627
+            text = _xmlescape(text)
628
+        self.elementstack[-1][2].append(text)
629
+
630
+    def handle_comment(self, text):
631
+        # called for each comment, e.g. <!-- insert message here -->
632
+        pass
633
+
634
+    def handle_pi(self, text):
635
+        # called for each processing instruction, e.g. <?instruction>
636
+        pass
637
+
638
+    def handle_decl(self, text):
639
+        pass
640
+
641
+    def parse_declaration(self, i):
642
+        # override internal declaration handler to handle CDATA blocks
643
+        if _debug: sys.stderr.write('entering parse_declaration\n')
644
+        if self.rawdata[i:i+9] == '<![CDATA[':
645
+            k = self.rawdata.find(']]>', i)
646
+            if k == -1: k = len(self.rawdata)
647
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
648
+            return k+3
649
+        else:
650
+            k = self.rawdata.find('>', i)
651
+            return k+1
652
+
653
+    def mapContentType(self, contentType):
654
+        contentType = contentType.lower()
655
+        if contentType == 'text':
656
+            contentType = 'text/plain'
657
+        elif contentType == 'html':
658
+            contentType = 'text/html'
659
+        elif contentType == 'xhtml':
660
+            contentType = 'application/xhtml+xml'
661
+        return contentType
662
+
663
+    def trackNamespace(self, prefix, uri):
664
+        loweruri = uri.lower()
665
+        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
666
+            self.version = 'rss090'
667
+        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
668
+            self.version = 'rss10'
669
+        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
670
+            self.version = 'atom10'
671
+        if loweruri.find('backend.userland.com/rss') <> -1:
672
+            # match any backend.userland.com namespace
673
+            uri = 'http://backend.userland.com/rss'
674
+            loweruri = uri
675
+        if self._matchnamespaces.has_key(loweruri):
676
+            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
677
+            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
678
+        else:
679
+            self.namespacesInUse[prefix or ''] = uri
680
+
681
+    def resolveURI(self, uri):
682
+        return _urljoin(self.baseuri or '', uri)
683
+
684
+    def decodeEntities(self, element, data):
685
+        return data
686
+
687
+    def strattrs(self, attrs):
688
+        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
689
+
690
+    def push(self, element, expectingText):
691
+        self.elementstack.append([element, expectingText, []])
692
+
693
+    def pop(self, element, stripWhitespace=1):
694
+        if not self.elementstack: return
695
+        if self.elementstack[-1][0] != element: return
696
+
697
+        element, expectingText, pieces = self.elementstack.pop()
698
+
699
+        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
700
+            # remove enclosing child element, but only if it is a <div> and
701
+            # only if all the remaining content is nested underneath it.
702
+            # This means that the divs would be retained in the following:
703
+            #    <div>foo</div><div>bar</div>
704
+            while pieces and len(pieces)>1 and not pieces[-1].strip():
705
+                del pieces[-1]
706
+            while pieces and len(pieces)>1 and not pieces[0].strip():
707
+                del pieces[0]
708
+            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
709
+                depth = 0
710
+                for piece in pieces[:-1]:
711
+                    if piece.startswith('</'):
712
+                        depth -= 1
713
+                        if depth == 0: break
714
+                    elif piece.startswith('<') and not piece.endswith('/>'):
715
+                        depth += 1
716
+                else:
717
+                    pieces = pieces[1:-1]
718
+
719
+        output = ''.join(pieces)
720
+        if stripWhitespace:
721
+            output = output.strip()
722
+        if not expectingText: return output
723
+
724
+        # decode base64 content
725
+        if base64 and self.contentparams.get('base64', 0):
726
+            try:
727
+                output = base64.decodestring(output)
728
+            except binascii.Error:
729
+                pass
730
+            except binascii.Incomplete:
731
+                pass
732
+
733
+        # resolve relative URIs
734
+        if (element in self.can_be_relative_uri) and output:
735
+            output = self.resolveURI(output)
736
+
737
+        # decode entities within embedded markup
738
+        if not self.contentparams.get('base64', 0):
739
+            output = self.decodeEntities(element, output)
740
+
741
+        if self.lookslikehtml(output):
742
+            self.contentparams['type']='text/html'
743
+
744
+        # remove temporary cruft from contentparams
745
+        try:
746
+            del self.contentparams['mode']
747
+        except KeyError:
748
+            pass
749
+        try:
750
+            del self.contentparams['base64']
751
+        except KeyError:
752
+            pass
753
+
754
+        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
755
+        # resolve relative URIs within embedded markup
756
+        if is_htmlish and RESOLVE_RELATIVE_URIS:
757
+            if element in self.can_contain_relative_uris:
758
+                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
759
+
760
+        # parse microformats
761
+        # (must do this before sanitizing because some microformats
762
+        # rely on elements that we sanitize)
763
+        if is_htmlish and element in ['content', 'description', 'summary']:
764
+            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
765
+            if mfresults:
766
+                for tag in mfresults.get('tags', []):
767
+                    self._addTag(tag['term'], tag['scheme'], tag['label'])
768
+                for enclosure in mfresults.get('enclosures', []):
769
+                    self._start_enclosure(enclosure)
770
+                for xfn in mfresults.get('xfn', []):
771
+                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
772
+                vcard = mfresults.get('vcard')
773
+                if vcard:
774
+                    self._getContext()['vcard'] = vcard
775
+
776
+        # sanitize embedded markup
777
+        if is_htmlish and SANITIZE_HTML:
778
+            if element in self.can_contain_dangerous_markup:
779
+                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
780
+
781
+        if self.encoding and type(output) != type(u''):
782
+            try:
783
+                output = unicode(output, self.encoding)
784
+            except:
785
+                pass
786
+
787
+        # address common error where people take data that is already
788
+        # utf-8, presume that it is iso-8859-1, and re-encode it.
789
+        if self.encoding=='utf-8' and type(output) == type(u''):
790
+            try:
791
+                output = unicode(output.encode('iso-8859-1'), 'utf-8')
792
+            except:
793
+                pass
794
+
795
+        # map win-1252 extensions to the proper code points
796
+        if type(output) == type(u''):
797
+            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
798
+
799
+        # categories/tags/keywords/whatever are handled in _end_category
800
+        if element == 'category':
801
+            return output
802
+
803
+        if element == 'title' and self.hasTitle:
804
+            return output
805
+
806
+        # store output in appropriate place(s)
807
+        if self.inentry and not self.insource:
808
+            if element == 'content':
809
+                self.entries[-1].setdefault(element, [])
810
+                contentparams = copy.deepcopy(self.contentparams)
811
+                contentparams['value'] = output
812
+                self.entries[-1][element].append(contentparams)
813
+            elif element == 'link':
814
+                self.entries[-1][element] = output
815
+                if output:
816
+                    self.entries[-1]['links'][-1]['href'] = output
817
+            else:
818
+                if element == 'description':
819
+                    element = 'summary'
820
+                self.entries[-1][element] = output
821
+                if self.incontent:
822
+                    contentparams = copy.deepcopy(self.contentparams)
823
+                    contentparams['value'] = output
824
+                    self.entries[-1][element + '_detail'] = contentparams
825
+        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
826
+            context = self._getContext()
827
+            if element == 'description':
828
+                element = 'subtitle'
829
+            context[element] = output
830
+            if element == 'link':
831
+                context['links'][-1]['href'] = output
832
+            elif self.incontent:
833
+                contentparams = copy.deepcopy(self.contentparams)
834
+                contentparams['value'] = output
835
+                context[element + '_detail'] = contentparams
836
+        return output
837
+
838
+    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
839
+        self.incontent += 1
840
+        if self.lang: self.lang=self.lang.replace('_','-')
841
+        self.contentparams = FeedParserDict({
842
+            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
843
+            'language': self.lang,
844
+            'base': self.baseuri})
845
+        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
846
+        self.push(tag, expectingText)
847
+
848
+    def popContent(self, tag):
849
+        value = self.pop(tag)
850
+        self.incontent -= 1
851
+        self.contentparams.clear()
852
+        return value
853
+
854
+    # a number of elements in a number of RSS variants are nominally plain
855
+    # text, but this is routinely ignored.  This is an attempt to detect
856
+    # the most common cases.  As false positives often result in silent
857
+    # data loss, this function errs on the conservative side.
858
+    def lookslikehtml(self, str):
859
+        if self.version.startswith('atom'): return
860
+        if self.contentparams.get('type','text/html') != 'text/plain': return
861
+
862
+        # must have a close tag or a entity reference to qualify
863
+        if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return
864
+
865
+        # all tags must be in a restricted subset of valid HTML tags
866
+        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
867
+            re.findall(r'</?(\w+)',str)): return
868
+
869
+        # all entities must have been defined as valid HTML entities
870
+        from htmlentitydefs import entitydefs
871
+        if filter(lambda e: e not in entitydefs.keys(),
872
+            re.findall(r'&(\w+);',str)): return
873
+
874
+        return 1
875
+
876
+    def _mapToStandardPrefix(self, name):
877
+        colonpos = name.find(':')
878
+        if colonpos <> -1:
879
+            prefix = name[:colonpos]
880
+            suffix = name[colonpos+1:]
881
+            prefix = self.namespacemap.get(prefix, prefix)
882
+            name = prefix + ':' + suffix
883
+        return name
884
+
885
+    def _getAttribute(self, attrsD, name):
886
+        return attrsD.get(self._mapToStandardPrefix(name))
887
+
888
+    def _isBase64(self, attrsD, contentparams):
889
+        if attrsD.get('mode', '') == 'base64':
890
+            return 1
891
+        if self.contentparams['type'].startswith('text/'):
892
+            return 0
893
+        if self.contentparams['type'].endswith('+xml'):
894
+            return 0
895
+        if self.contentparams['type'].endswith('/xml'):
896
+            return 0
897
+        return 1
898
+
899
+    def _itsAnHrefDamnIt(self, attrsD):
900
+        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
901
+        if href:
902
+            try:
903
+                del attrsD['url']
904
+            except KeyError:
905
+                pass
906
+            try:
907
+                del attrsD['uri']
908
+            except KeyError:
909
+                pass
910
+            attrsD['href'] = href
911
+        return attrsD
912
+
913
+    def _save(self, key, value):
914
+        context = self._getContext()
915
+        context.setdefault(key, value)
916
+
917
+    def _start_rss(self, attrsD):
918
+        versionmap = {'0.91': 'rss091u',
919
+                      '0.92': 'rss092',
920
+                      '0.93': 'rss093',
921
+                      '0.94': 'rss094'}
922
+        if not self.version:
923
+            attr_version = attrsD.get('version', '')
924
+            version = versionmap.get(attr_version)
925
+            if version:
926
+                self.version = version
927
+            elif attr_version.startswith('2.'):
928
+                self.version = 'rss20'
929
+            else:
930
+                self.version = 'rss'
931
+
932
+    def _start_dlhottitles(self, attrsD):
933
+        self.version = 'hotrss'
934
+
935
+    def _start_channel(self, attrsD):
936
+        self.infeed = 1
937
+        self._cdf_common(attrsD)
938
+    _start_feedinfo = _start_channel
939
+
940
+    def _cdf_common(self, attrsD):
941
+        if attrsD.has_key('lastmod'):
942
+            self._start_modified({})
943
+            self.elementstack[-1][-1] = attrsD['lastmod']
944
+            self._end_modified()
945
+        if attrsD.has_key('href'):
946
+            self._start_link({})
947
+            self.elementstack[-1][-1] = attrsD['href']
948
+            self._end_link()
949
+
950
+    def _start_feed(self, attrsD):
951
+        self.infeed = 1
952
+        versionmap = {'0.1': 'atom01',
953
+                      '0.2': 'atom02',
954
+                      '0.3': 'atom03'}
955
+        if not self.version:
956
+            attr_version = attrsD.get('version')
957
+            version = versionmap.get(attr_version)
958
+            if version:
959
+                self.version = version
960
+            else:
961
+                self.version = 'atom'
962
+
963
+    def _end_channel(self):
964
+        self.infeed = 0
965
+    _end_feed = _end_channel
966
+
967
+    def _start_image(self, attrsD):
968
+        context = self._getContext()
969
+        context.setdefault('image', FeedParserDict())
970
+        self.inimage = 1
971
+        self.hasTitle = 0
972
+        self.push('image', 0)
973
+
974
+    def _end_image(self):
975
+        self.pop('image')
976
+        self.inimage = 0
977
+
978
+    def _start_textinput(self, attrsD):
979
+        context = self._getContext()
980
+        context.setdefault('textinput', FeedParserDict())
981
+        self.intextinput = 1
982
+        self.hasTitle = 0
983
+        self.push('textinput', 0)
984
+    _start_textInput = _start_textinput
985
+
986
+    def _end_textinput(self):
987
+        self.pop('textinput')
988
+        self.intextinput = 0
989
+    _end_textInput = _end_textinput
990
+
991
+    def _start_author(self, attrsD):
992
+        self.inauthor = 1
993
+        self.push('author', 1)
994
+    _start_managingeditor = _start_author
995
+    _start_dc_author = _start_author
996
+    _start_dc_creator = _start_author
997
+    _start_itunes_author = _start_author
998
+
999
+    def _end_author(self):
1000
+        self.pop('author')
1001
+        self.inauthor = 0
1002
+        self._sync_author_detail()
1003
+    _end_managingeditor = _end_author
1004
+    _end_dc_author = _end_author
1005
+    _end_dc_creator = _end_author
1006
+    _end_itunes_author = _end_author
1007
+
1008
+    def _start_itunes_owner(self, attrsD):
1009
+        self.inpublisher = 1
1010
+        self.push('publisher', 0)
1011
+
1012
+    def _end_itunes_owner(self):
1013
+        self.pop('publisher')
1014
+        self.inpublisher = 0
1015
+        self._sync_author_detail('publisher')
1016
+
1017
+    def _start_contributor(self, attrsD):
1018
+        self.incontributor = 1
1019
+        context = self._getContext()
1020
+        context.setdefault('contributors', [])
1021
+        context['contributors'].append(FeedParserDict())
1022
+        self.push('contributor', 0)
1023
+
1024
+    def _end_contributor(self):
1025
+        self.pop('contributor')
1026
+        self.incontributor = 0
1027
+
1028
+    def _start_dc_contributor(self, attrsD):
1029
+        self.incontributor = 1
1030
+        context = self._getContext()
1031
+        context.setdefault('contributors', [])
1032
+        context['contributors'].append(FeedParserDict())
1033
+        self.push('name', 0)
1034
+
1035
+    def _end_dc_contributor(self):
1036
+        self._end_name()
1037
+        self.incontributor = 0
1038
+
1039
+    def _start_name(self, attrsD):
1040
+        self.push('name', 0)
1041
+    _start_itunes_name = _start_name
1042
+
1043
+    def _end_name(self):
1044
+        value = self.pop('name')
1045
+        if self.inpublisher:
1046
+            self._save_author('name', value, 'publisher')
1047
+        elif self.inauthor:
1048
+            self._save_author('name', value)
1049
+        elif self.incontributor:
1050
+            self._save_contributor('name', value)
1051
+        elif self.intextinput:
1052
+            context = self._getContext()
1053
+            context['name'] = value
1054
+    _end_itunes_name = _end_name
1055
+
1056
+    def _start_width(self, attrsD):
1057
+        self.push('width', 0)
1058
+
1059
+    def _end_width(self):
1060
+        value = self.pop('width')
1061
+        try:
1062
+            value = int(value)
1063
+        except:
1064
+            value = 0
1065
+        if self.inimage:
1066
+            context = self._getContext()
1067
+            context['width'] = value
1068
+
1069
+    def _start_height(self, attrsD):
1070
+        self.push('height', 0)
1071
+
1072
+    def _end_height(self):
1073
+        value = self.pop('height')
1074
+        try:
1075
+            value = int(value)
1076
+        except:
1077
+            value = 0
1078
+        if self.inimage:
1079
+            context = self._getContext()
1080
+            context['height'] = value
1081
+
1082
+    def _start_url(self, attrsD):
1083
+        self.push('href', 1)
1084
+    _start_homepage = _start_url
1085
+    _start_uri = _start_url
1086
+
1087
+    def _end_url(self):
1088
+        value = self.pop('href')
1089
+        if self.inauthor:
1090
+            self._save_author('href', value)
1091
+        elif self.incontributor:
1092
+            self._save_contributor('href', value)
1093
+    _end_homepage = _end_url
1094
+    _end_uri = _end_url
1095
+
1096
+    def _start_email(self, attrsD):
1097
+        self.push('email', 0)
1098
+    _start_itunes_email = _start_email
1099
+
1100
+    def _end_email(self):
1101
+        value = self.pop('email')
1102
+        if self.inpublisher:
1103
+            self._save_author('email', value, 'publisher')
1104
+        elif self.inauthor:
1105
+            self._save_author('email', value)
1106
+        elif self.incontributor:
1107
+            self._save_contributor('email', value)
1108
+    _end_itunes_email = _end_email
1109
+
1110
+    def _getContext(self):
1111
+        if self.insource:
1112
+            context = self.sourcedata
1113
+        elif self.inimage:
1114
+            context = self.feeddata['image']
1115
+        elif self.intextinput:
1116
+            context = self.feeddata['textinput']
1117
+        elif self.inentry:
1118
+            context = self.entries[-1]
1119
+        else:
1120
+            context = self.feeddata
1121
+        return context
1122
+
1123
+    def _save_author(self, key, value, prefix='author'):
1124
+        context = self._getContext()
1125
+        context.setdefault(prefix + '_detail', FeedParserDict())
1126
+        context[prefix + '_detail'][key] = value
1127
+        self._sync_author_detail()
1128
+
1129
+    def _save_contributor(self, key, value):
1130
+        context = self._getContext()
1131
+        context.setdefault('contributors', [FeedParserDict()])
1132
+        context['contributors'][-1][key] = value
1133
+
1134
+    def _sync_author_detail(self, key='author'):
1135
+        context = self._getContext()
1136
+        detail = context.get('%s_detail' % key)
1137
+        if detail:
1138
+            name = detail.get('name')
1139
+            email = detail.get('email')
1140
+            if name and email:
1141
+                context[key] = '%s (%s)' % (name, email)
1142
+            elif name:
1143
+                context[key] = name
1144
+            elif email:
1145
+                context[key] = email
1146
+        else:
1147
+            author, email = context.get(key), None
1148
+            if not author: return
1149
+            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1150
+            if emailmatch:
1151
+                email = emailmatch.group(0)
1152
+                # probably a better way to do the following, but it passes all the tests
1153
+                author = author.replace(email, '')
1154
+                author = author.replace('()', '')
1155
+                author = author.replace('<>', '')
1156
+                author = author.replace('&lt;&gt;', '')
1157
+                author = author.strip()
1158
+                if author and (author[0] == '('):
1159
+                    author = author[1:]
1160
+                if author and (author[-1] == ')'):
1161
+                    author = author[:-1]
1162
+                author = author.strip()
1163
+            if author or email:
1164
+                context.setdefault('%s_detail' % key, FeedParserDict())
1165
+            if author:
1166
+                context['%s_detail' % key]['name'] = author
1167
+            if email:
1168
+                context['%s_detail' % key]['email'] = email
1169
+
1170
+    def _start_subtitle(self, attrsD):
1171
+        self.pushContent('subtitle', attrsD, 'text/plain', 1)
1172
+    _start_tagline = _start_subtitle
1173
+    _start_itunes_subtitle = _start_subtitle
1174
+
1175
+    def _end_subtitle(self):
1176
+        self.popContent('subtitle')
1177
+    _end_tagline = _end_subtitle
1178
+    _end_itunes_subtitle = _end_subtitle
1179
+
1180
+    def _start_rights(self, attrsD):
1181
+        self.pushContent('rights', attrsD, 'text/plain', 1)
1182
+    _start_dc_rights = _start_rights
1183
+    _start_copyright = _start_rights
1184
+
1185
+    def _end_rights(self):
1186
+        self.popContent('rights')
1187
+    _end_dc_rights = _end_rights
1188
+    _end_copyright = _end_rights
1189
+
1190
+    def _start_item(self, attrsD):
1191
+        self.entries.append(FeedParserDict())
1192
+        self.push('item', 0)
1193
+        self.inentry = 1
1194
+        self.guidislink = 0
1195
+        self.hasTitle = 0
1196
+        id = self._getAttribute(attrsD, 'rdf:about')
1197
+        if id:
1198
+            context = self._getContext()
1199
+            context['id'] = id
1200
+        self._cdf_common(attrsD)
1201
+    _start_entry = _start_item
1202
+    _start_product = _start_item
1203
+
1204
+    def _end_item(self):
1205
+        self.pop('item')
1206
+        self.inentry = 0
1207
+    _end_entry = _end_item
1208
+
1209
+    def _start_dc_language(self, attrsD):
1210
+        self.push('language', 1)
1211
+    _start_language = _start_dc_language
1212
+
1213
+    def _end_dc_language(self):
1214
+        self.lang = self.pop('language')
1215
+    _end_language = _end_dc_language
1216
+
1217
+    def _start_dc_publisher(self, attrsD):
1218
+        self.push('publisher', 1)
1219
+    _start_webmaster = _start_dc_publisher
1220
+
1221
+    def _end_dc_publisher(self):
1222
+        self.pop('publisher')
1223
+        self._sync_author_detail('publisher')
1224
+    _end_webmaster = _end_dc_publisher
1225
+
1226
+    def _start_published(self, attrsD):
1227
+        self.push('published', 1)
1228
+    _start_dcterms_issued = _start_published
1229
+    _start_issued = _start_published
1230
+
1231
+    def _end_published(self):
1232
+        value = self.pop('published')
1233
+        self._save('published_parsed', _parse_date(value))
1234
+    _end_dcterms_issued = _end_published
1235
+    _end_issued = _end_published
1236
+
1237
+    def _start_updated(self, attrsD):
1238
+        self.push('updated', 1)
1239
+    _start_modified = _start_updated
1240
+    _start_dcterms_modified = _start_updated
1241
+    _start_pubdate = _start_updated
1242
+    _start_dc_date = _start_updated
1243
+
1244
+    def _end_updated(self):
1245
+        value = self.pop('updated')
1246
+        parsed_value = _parse_date(value)
1247
+        self._save('updated_parsed', parsed_value)
1248
+    _end_modified = _end_updated
1249
+    _end_dcterms_modified = _end_updated
1250
+    _end_pubdate = _end_updated
1251
+    _end_dc_date = _end_updated
1252
+
1253
+    def _start_created(self, attrsD):
1254
+        self.push('created', 1)
1255
+    _start_dcterms_created = _start_created
1256
+
1257
+    def _end_created(self):
1258
+        value = self.pop('created')
1259
+        self._save('created_parsed', _parse_date(value))
1260
+    _end_dcterms_created = _end_created
1261
+
1262
+    def _start_expirationdate(self, attrsD):
1263
+        self.push('expired', 1)
1264
+
1265
+    def _end_expirationdate(self):
1266
+        self._save('expired_parsed', _parse_date(self.pop('expired')))
1267
+
1268
+    def _start_cc_license(self, attrsD):
1269
+        context = self._getContext()
1270
+        value = self._getAttribute(attrsD, 'rdf:resource')
1271
+        attrsD = FeedParserDict()
1272
+        attrsD['rel']='license'
1273
+        if value: attrsD['href']=value
1274
+        context.setdefault('links', []).append(attrsD)
1275
+
1276
+    def _start_creativecommons_license(self, attrsD):
1277
+        self.push('license', 1)
1278
+    _start_creativeCommons_license = _start_creativecommons_license
1279
+
1280
+    def _end_creativecommons_license(self):
1281
+        value = self.pop('license')
1282
+        context = self._getContext()
1283
+        attrsD = FeedParserDict()
1284
+        attrsD['rel']='license'
1285
+        if value: attrsD['href']=value
1286
+        context.setdefault('links', []).append(attrsD)
1287
+        del context['license']
1288
+    _end_creativeCommons_license = _end_creativecommons_license
1289
+
1290
+    def _addXFN(self, relationships, href, name):
1291
+        context = self._getContext()
1292
+        xfn = context.setdefault('xfn', [])
1293
+        value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
1294
+        if value not in xfn:
1295
+            xfn.append(value)
1296
+
1297
+    def _addTag(self, term, scheme, label):
1298
+        context = self._getContext()
1299
+        tags = context.setdefault('tags', [])
1300
+        if (not term) and (not scheme) and (not label): return
1301
+        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1302
+        if value not in tags:
1303
+            tags.append(value)
1304
+
1305
+    def _start_category(self, attrsD):
1306
+        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1307
+        term = attrsD.get('term')
1308
+        scheme = attrsD.get('scheme', attrsD.get('domain'))
1309
+        label = attrsD.get('label')
1310
+        self._addTag(term, scheme, label)
1311
+        self.push('category', 1)
1312
+    _start_dc_subject = _start_category
1313
+    _start_keywords = _start_category
1314
+
1315
+    def _end_itunes_keywords(self):
1316
+        for term in self.pop('itunes_keywords').split():
1317
+            self._addTag(term, 'http://www.itunes.com/', None)
1318
+
1319
+    def _start_itunes_category(self, attrsD):
1320
+        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1321
+        self.push('category', 1)
1322
+
1323
+    def _end_category(self):
1324
+        value = self.pop('category')
1325
+        if not value: return
1326
+        context = self._getContext()
1327
+        tags = context['tags']
1328
+        if value and len(tags) and not tags[-1]['term']:
1329
+            tags[-1]['term'] = value
1330
+        else:
1331
+            self._addTag(value, None, None)
1332
+    _end_dc_subject = _end_category
1333
+    _end_keywords = _end_category
1334
+    _end_itunes_category = _end_category
1335
+
1336
+    def _start_cloud(self, attrsD):
1337
+        self._getContext()['cloud'] = FeedParserDict(attrsD)
1338
+
1339
+    def _start_link(self, attrsD):
1340
+        attrsD.setdefault('rel', 'alternate')
1341
+        if attrsD['rel'] == 'self':
1342
+            attrsD.setdefault('type', 'application/atom+xml')
1343
+        else:
1344
+            attrsD.setdefault('type', 'text/html')
1345
+        context = self._getContext()
1346
+        attrsD = self._itsAnHrefDamnIt(attrsD)
1347
+        if attrsD.has_key('href'):
1348
+            attrsD['href'] = self.resolveURI(attrsD['href'])
1349
+            if attrsD.get('rel')=='enclosure' and not context.get('id'):
1350
+                context['id'] = attrsD.get('href')
1351
+        expectingText = self.infeed or self.inentry or self.insource
1352
+        context.setdefault('links', [])
1353
+        context['links'].append(FeedParserDict(attrsD))
1354
+        if attrsD.has_key('href'):
1355
+            expectingText = 0
1356
+            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1357
+                context['link'] = attrsD['href']
1358
+        else:
1359
+            self.push('link', expectingText)
1360
+    _start_producturl = _start_link
1361
+
1362
+    def _end_link(self):
1363
+        value = self.pop('link')
1364
+        context = self._getContext()
1365
+    _end_producturl = _end_link
1366
+
1367
+    def _start_guid(self, attrsD):
1368
+        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1369
+        self.push('id', 1)
1370
+
1371
+    def _end_guid(self):
1372
+        value = self.pop('id')
1373
+        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1374
+        if self.guidislink:
1375
+            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1376
+            # and only if the item doesn't already have a link element
1377
+            self._save('link', value)
1378
+
1379
+    def _start_title(self, attrsD):
1380
+        if self.svgOK: return self.unknown_starttag('title', attrsD.items())
1381
+        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1382
+    _start_dc_title = _start_title
1383
+    _start_media_title = _start_title
1384
+
1385
+    def _end_title(self):
1386
+        if self.svgOK: return
1387
+        value = self.popContent('title')
1388
+        if not value: return
1389
+        context = self._getContext()
1390
+        self.hasTitle = 1
1391
+    _end_dc_title = _end_title
1392
+
1393
+    def _end_media_title(self):
1394
+        hasTitle = self.hasTitle
1395
+        self._end_title()
1396
+        self.hasTitle = hasTitle
1397
+
1398
+    def _start_description(self, attrsD):
1399
+        context = self._getContext()
1400
+        if context.has_key('summary'):
1401
+            self._summaryKey = 'content'
1402
+            self._start_content(attrsD)
1403
+        else:
1404
+            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1405
+    _start_dc_description = _start_description
1406
+
1407
+    def _start_abstract(self, attrsD):
1408
+        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1409
+
1410
+    def _end_description(self):
1411
+        if self._summaryKey == 'content':
1412
+            self._end_content()
1413
+        else:
1414
+            value = self.popContent('description')
1415
+        self._summaryKey = None
1416
+    _end_abstract = _end_description
1417
+    _end_dc_description = _end_description
1418
+
1419
+    def _start_info(self, attrsD):
1420
+        self.pushContent('info', attrsD, 'text/plain', 1)
1421
+    _start_feedburner_browserfriendly = _start_info
1422
+
1423
+    def _end_info(self):
1424
+        self.popContent('info')
1425
+    _end_feedburner_browserfriendly = _end_info
1426
+
1427
+    def _start_generator(self, attrsD):
1428
+        if attrsD:
1429
+            attrsD = self._itsAnHrefDamnIt(attrsD)
1430
+            if attrsD.has_key('href'):
1431
+                attrsD['href'] = self.resolveURI(attrsD['href'])
1432
+        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1433
+        self.push('generator', 1)
1434
+
1435
+    def _end_generator(self):
1436
+        value = self.pop('generator')
1437
+        context = self._getContext()
1438
+        if context.has_key('generator_detail'):
1439
+            context['generator_detail']['name'] = value
1440
+
1441
+    def _start_admin_generatoragent(self, attrsD):
1442
+        self.push('generator', 1)
1443
+        value = self._getAttribute(attrsD, 'rdf:resource')
1444
+        if value:
1445
+            self.elementstack[-1][2].append(value)
1446
+        self.pop('generator')
1447
+        self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1448
+
1449
+    def _start_admin_errorreportsto(self, attrsD):
1450
+        self.push('errorreportsto', 1)
1451
+        value = self._getAttribute(attrsD, 'rdf:resource')
1452
+        if value:
1453
+            self.elementstack[-1][2].append(value)
1454
+        self.pop('errorreportsto')
1455
+
1456
+    def _start_summary(self, attrsD):
1457
+        context = self._getContext()
1458
+        if context.has_key('summary'):
1459
+            self._summaryKey = 'content'
1460
+            self._start_content(attrsD)
1461
+        else:
1462
+            self._summaryKey = 'summary'
1463
+            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1464
+    _start_itunes_summary = _start_summary
1465
+
1466
+    def _end_summary(self):
1467
+        if self._summaryKey == 'content':
1468
+            self._end_content()
1469
+        else:
1470
+            self.popContent(self._summaryKey or 'summary')
1471
+        self._summaryKey = None
1472
+    _end_itunes_summary = _end_summary
1473
+
1474
+    def _start_enclosure(self, attrsD):
1475
+        attrsD = self._itsAnHrefDamnIt(attrsD)
1476
+        context = self._getContext()
1477
+        attrsD['rel']='enclosure'
1478
+        context.setdefault('links', []).append(FeedParserDict(attrsD))
1479
+        href = attrsD.get('href')
1480
+        if href and not context.get('id'):
1481
+            context['id'] = href
1482
+
1483
+    def _start_source(self, attrsD):
1484
+        self.insource = 1
1485
+        self.hasTitle = 0
1486
+
1487
+    def _end_source(self):
1488
+        self.insource = 0
1489
+        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1490
+        self.sourcedata.clear()
1491
+
1492
+    def _start_content(self, attrsD):
1493
+        self.pushContent('content', attrsD, 'text/plain', 1)
1494
+        src = attrsD.get('src')
1495
+        if src:
1496
+            self.contentparams['src'] = src
1497
+        self.push('content', 1)
1498
+
1499
+    def _start_prodlink(self, attrsD):
1500
+        self.pushContent('content', attrsD, 'text/html', 1)
1501
+
1502
+    def _start_body(self, attrsD):
1503
+        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1504
+    _start_xhtml_body = _start_body
1505
+
1506
+    def _start_content_encoded(self, attrsD):
1507
+        self.pushContent('content', attrsD, 'text/html', 1)
1508
+    _start_fullitem = _start_content_encoded
1509
+
1510
+    def _end_content(self):
1511
+        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1512
+        value = self.popContent('content')
1513
+        if copyToDescription:
1514
+            self._save('description', value)
1515
+
1516
+    _end_body = _end_content
1517
+    _end_xhtml_body = _end_content
1518
+    _end_content_encoded = _end_content
1519
+    _end_fullitem = _end_content
1520
+    _end_prodlink = _end_content
1521
+
1522
+    def _start_itunes_image(self, attrsD):
1523
+        self.push('itunes_image', 0)
1524
+        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1525
+    _start_itunes_link = _start_itunes_image
1526
+
1527
+    def _end_itunes_block(self):
1528
+        value = self.pop('itunes_block', 0)
1529
+        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1530
+
1531
+    def _end_itunes_explicit(self):
1532
+        value = self.pop('itunes_explicit', 0)
1533
+        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
1534
+
1535
+if _XML_AVAILABLE:
1536
+    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1537
+        def __init__(self, baseuri, baselang, encoding):
1538
+            if _debug: sys.stderr.write('trying StrictFeedParser\n')
1539
+            xml.sax.handler.ContentHandler.__init__(self)
1540
+            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1541
+            self.bozo = 0
1542
+            self.exc = None
1543
+
1544
+        def startPrefixMapping(self, prefix, uri):
1545
+            self.trackNamespace(prefix, uri)
1546
+
1547
+        def startElementNS(self, name, qname, attrs):
1548
+            namespace, localname = name
1549
+            lowernamespace = str(namespace or '').lower()
1550
+            if lowernamespace.find('backend.userland.com/rss') <> -1:
1551
+                # match any backend.userland.com namespace
1552
+                namespace = 'http://backend.userland.com/rss'
1553
+                lowernamespace = namespace
1554
+            if qname and qname.find(':') > 0:
1555
+                givenprefix = qname.split(':')[0]
1556
+            else:
1557
+                givenprefix = None
1558
+            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1559
+            if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
1560
+                    raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
1561
+            localname = str(localname).lower()
1562
+
1563
+            # qname implementation is horribly broken in Python 2.1 (it
1564
+            # doesn't report any), and slightly broken in Python 2.2 (it
1565
+            # doesn't report the xml: namespace). So we match up namespaces
1566
+            # with a known list first, and then possibly override them with
1567
+            # the qnames the SAX parser gives us (if indeed it gives us any
1568
+            # at all).  Thanks to MatejC for helping me test this and
1569
+            # tirelessly telling me that it didn't work yet.
1570
+            attrsD = {}
1571
+            if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
1572
+                attrsD['xmlns']=namespace
1573
+            if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
1574
+                attrsD['xmlns']=namespace
1575
+
1576
+            if prefix:
1577
+                localname = prefix.lower() + ':' + localname
1578
+            elif namespace and not qname: #Expat
1579
+                for name,value in self.namespacesInUse.items():
1580
+                     if name and value == namespace:
1581
+                         localname = name + ':' + localname
1582
+                         break
1583
+            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
1584
+
1585
+            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
1586
+                lowernamespace = (namespace or '').lower()
1587
+                prefix = self._matchnamespaces.get(lowernamespace, '')
1588
+                if prefix:
1589
+                    attrlocalname = prefix + ':' + attrlocalname
1590
+                attrsD[str(attrlocalname).lower()] = attrvalue
1591
+            for qname in attrs.getQNames():
1592
+                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1593
+            self.unknown_starttag(localname, attrsD.items())
1594
+
1595
+        def characters(self, text):
1596
+            self.handle_data(text)
1597
+
1598
+        def endElementNS(self, name, qname):
1599
+            namespace, localname = name
1600
+            lowernamespace = str(namespace or '').lower()
1601
+            if qname and qname.find(':') > 0:
1602
+                givenprefix = qname.split(':')[0]
1603
+            else:
1604
+                givenprefix = ''
1605
+            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1606
+            if prefix:
1607
+                localname = prefix + ':' + localname
1608
+            elif namespace and not qname: #Expat
1609
+                for name,value in self.namespacesInUse.items():
1610
+                     if name and value == namespace:
1611
+                         localname = name + ':' + localname
1612
+                         break
1613
+            localname = str(localname).lower()
1614
+            self.unknown_endtag(localname)
1615
+
1616
+        def error(self, exc):
1617
+            self.bozo = 1
1618
+            self.exc = exc
1619
+
1620
+        def fatalError(self, exc):
1621
+            self.error(exc)
1622
+            raise exc
1623
+
1624
+class _BaseHTMLProcessor(sgmllib.SGMLParser):
1625
+    special = re.compile('''[<>'"]''')
1626
+    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
1627
+    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
1628
+      'img', 'input', 'isindex', 'link', 'meta', 'param']
1629
+
1630
+    def __init__(self, encoding, type):
1631
+        self.encoding = encoding
1632
+        self.type = type
1633
+        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1634
+        sgmllib.SGMLParser.__init__(self)
1635
+
1636
+    def reset(self):
1637
+        self.pieces = []
1638
+        sgmllib.SGMLParser.reset(self)
1639
+
1640
+    def _shorttag_replace(self, match):
1641
+        tag = match.group(1)
1642
+        if tag in self.elements_no_end_tag:
1643
+            return '<' + tag + ' />'
1644
+        else:
1645
+            return '<' + tag + '></' + tag + '>'
1646
+
1647
+    def parse_starttag(self,i):
1648
+        j=sgmllib.SGMLParser.parse_starttag(self, i)
1649
+        if self.type == 'application/xhtml+xml':
1650
+            if j>2 and self.rawdata[j-2:j]=='/>':
1651
+                self.unknown_endtag(self.lasttag)
1652
+        return j
1653
+
1654
+    def feed(self, data):
1655
+        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
1656
+        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1657
+        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
1658
+        data = data.replace('&#39;', "'")
1659
+        data = data.replace('&#34;', '"')
1660
+        if self.encoding and type(data) == type(u''):
1661
+            data = data.encode(self.encoding)
1662
+        sgmllib.SGMLParser.feed(self, data)
1663
+        sgmllib.SGMLParser.close(self)
1664
+
1665
+    def normalize_attrs(self, attrs):
1666
+        if not attrs: return attrs
1667
+        # utility method to be called by descendants
1668
+        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
1669
+        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1670
+        attrs.sort()
1671
+        return attrs
1672
+
1673
+    def unknown_starttag(self, tag, attrs):
1674
+        # called for each start tag
1675
+        # attrs is a list of (attr, value) tuples
1676
+        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1677
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1678
+        uattrs = []
1679
+        strattrs=''
1680
+        if attrs:
1681
+            for key, value in attrs:
1682
+                value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
1683
+                value = self.bare_ampersand.sub("&amp;", value)
1684
+                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1685
+                if type(value) != type(u''):
1686
+                    try:
1687
+                        value = unicode(value, self.encoding)
1688
+                    except:
1689
+                        value = unicode(value, 'iso-8859-1')
1690
+                uattrs.append((unicode(key, self.encoding), value))
1691
+            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
1692
+            if self.encoding:
1693
+                try:
1694
+                    strattrs=strattrs.encode(self.encoding)
1695
+                except:
1696
+                    pass
1697
+        if tag in self.elements_no_end_tag:
1698
+            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1699
+        else:
1700
+            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1701
+
1702
+    def unknown_endtag(self, tag):
1703
+        # called for each end tag, e.g. for </pre>, tag will be 'pre'
1704
+        # Reconstruct the original end tag.
1705
+        if tag not in self.elements_no_end_tag:
1706
+            self.pieces.append("</%(tag)s>" % locals())
1707
+
1708
+    def handle_charref(self, ref):
1709
+        # called for each character reference, e.g. for '&#160;', ref will be '160'
1710
+        # Reconstruct the original character reference.
1711
+        if ref.startswith('x'):
1712
+            value = unichr(int(ref[1:],16))
1713
+        else:
1714
+            value = unichr(int(ref))
1715
+
1716
+        if value in _cp1252.keys():
1717
+            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
1718
+        else:
1719
+            self.pieces.append('&#%(ref)s;' % locals())
1720
+
1721
+    def handle_entityref(self, ref):
1722
+        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1723
+        # Reconstruct the original entity reference.
1724
+        if name2codepoint.has_key(ref):
1725
+            self.pieces.append('&%(ref)s;' % locals())
1726
+        else:
1727
+            self.pieces.append('&amp;%(ref)s' % locals())
1728
+
1729
+    def handle_data(self, text):
1730
+        # called for each block of plain text, i.e. outside of any tag and
1731
+        # not containing any character or entity references
1732
+        # Store the original text verbatim.
1733
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
1734
+        self.pieces.append(text)
1735
+
1736
+    def handle_comment(self, text):
1737
+        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1738
+        # Reconstruct the original comment.
1739
+        self.pieces.append('<!--%(text)s-->' % locals())
1740
+
1741
+    def handle_pi(self, text):
1742
+        # called for each processing instruction, e.g. <?instruction>
1743
+        # Reconstruct original processing instruction.
1744
+        self.pieces.append('<?%(text)s>' % locals())
1745
+
1746
+    def handle_decl(self, text):
1747
+        # called for the DOCTYPE, if present, e.g.
1748
+        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1749
+        #     "http://www.w3.org/TR/html4/loose.dtd">
1750
+        # Reconstruct original DOCTYPE
1751
+        self.pieces.append('<!%(text)s>' % locals())
1752
+
1753
+    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1754
+    def _scan_name(self, i, declstartpos):
1755
+        rawdata = self.rawdata
1756
+        n = len(rawdata)
1757
+        if i == n:
1758
+            return None, -1
1759
+        m = self._new_declname_match(rawdata, i)
1760
+        if m:
1761
+            s = m.group()
1762
+            name = s.strip()
1763
+            if (i + len(s)) == n:
1764
+                return None, -1  # end of buffer
1765
+            return name.lower(), m.end()
1766
+        else:
1767
+            self.handle_data(rawdata)
1768
+#            self.updatepos(declstartpos, i)
1769
+            return None, -1
1770
+
1771
+    def convert_charref(self, name):
1772
+        return '&#%s;' % name
1773
+
1774
+    def convert_entityref(self, name):
1775
+        return '&%s;' % name
1776
+
1777
+    def output(self):
1778
+        '''Return processed HTML as a single string'''
1779
+        return ''.join([str(p) for p in self.pieces])
1780
+
1781
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
1782
+    def __init__(self, baseuri, baselang, encoding, entities):
1783
+        sgmllib.SGMLParser.__init__(self)
1784
+        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1785
+        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
1786
+        self.entities=entities
1787
+
1788
+    def decodeEntities(self, element, data):
1789