Original 2010-09-03 version
David Blume

David Blume committed on 2018-01-20 20:10:33
Showing 4 changed files, with 4291 additions and 0 deletions.

@@ -0,0 +1,19 @@
+Copyright (c) 2018, David Blume
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,30 @@
+[![License](https://img.shields.io/badge/license-MIT_license-blue.svg)](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt)
+![python2.x](https://img.shields.io/badge/python-2.x-yellow.svg)
+# TechCrunch Feed Filter
+
+This is a Python script, run as a cronjob, that reads the TechCrunch article
+feed and decides which articles to include in its own feed.
+
+Here's a [blog post about it](http://david.dlma.com/blog/my-techcrunch-feed-filter).
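+
+A minimal sketch of that loop (the feed URL, predicate, and names below are
+illustrative, not the production code):
+
+```python
+import feedparser
+
+def is_interesting(entry):
+    # Stand-in for the real scoring, which weighs tags and author.
+    return 'apple' in entry.get('title', '').lower()
+
+feed = feedparser.parse('https://techcrunch.com/feed/')
+keepers = [e for e in feed.entries if is_interesting(e)]
+```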
+
+# History
+
+This was originally archived in a Subversion repo. I'd forgotten about the
+version control and had gotten into the habit of just modifying the production
+site.
+
+* 2010-09-03: Original
+* 2010-09-03: Save off the disqus identifier for use later.
+* 2011-02-04: Algorithm changes (tags and author checked), new chart drawing, spaces used instead of tabs.
+* 2011-02-04: Update to the chart drawing algorithm.
+* 2013-08-04: Miscellaneous changes to techcrunch.py
+* 2015-11-23: Resync svn with production site.
+* 2015-11-27: Remove obsolete disqus and retweet code, and refactor style to be more PEP-8ish.
+
+# Is it any good?
+
+[Yes](https://news.ycombinator.com/item?id=3067434).
+
+# License
+
+This software uses the [MIT license](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt).
@@ -0,0 +1,3612 @@
+#!/usr/bin/env python
+"""Universal feed parser
+
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
+
+Visit http://feedparser.org/ for the latest version
+Visit http://feedparser.org/docs/ for the latest documentation
+
+Required: Python 2.1 or later
+Recommended: Python 2.3 or later
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
+"""
+
+__version__ = "4.2-pre-" + "$Revision: 291 $"[11:14] + "-svn"
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE."""
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+                    "John Beimler <http://john.beimler.org/>",
+                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+                    "Aaron Swartz <http://aaronsw.com/>",
+                    "Kevin Marks <http://epeus.blogspot.com/>",
+                    "Sam Ruby <http://intertwingly.net/>"]
+_debug = 0
+
+# HTTP "User-Agent" header to send to servers when downloading feeds.
+# If you are embedding feedparser in a larger application, you should
+# change this to your application name and URL.
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
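+# An embedding application might, for example, set (values hypothetical):
+#   USER_AGENT = "MyFeedApp/1.0 +http://example.com/myfeedapp/"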
+
+# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
+# want to send an Accept header, set this to None.
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
+
+# List of preferred XML parsers, by SAX driver name.  These will be tried first,
+# but if they're not installed, Python will keep searching through its own list
+# of pre-installed parsers until it finds one that supports everything we need.
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
+
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set
+# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+# or utidylib <http://utidylib.berlios.de/>.
+TIDY_MARKUP = 0
+
+# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
+# ---------- required modules (should come with any Python distribution) ----------
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
+try:
+    from cStringIO import StringIO as _StringIO
+except:
+    from StringIO import StringIO as _StringIO
+
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
+# gzip is included with most Python distributions, but may not be available if you compiled your own
+try:
+    import gzip
+except:
+    gzip = None
+try:
+    import zlib
+except:
+    zlib = None
+
+# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
+# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
+try:
+    import xml.sax
+    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
+    from xml.sax.saxutils import escape as _xmlescape
+    _XML_AVAILABLE = 1
+except:
+    _XML_AVAILABLE = 0
+    def _xmlescape(data,entities={}):
+        data = data.replace('&', '&amp;')
+        data = data.replace('>', '&gt;')
+        data = data.replace('<', '&lt;')
+        for char, entity in entities.items():
+            data = data.replace(char, entity)
+        return data
+
+# base64 support for Atom feeds that contain embedded binary data
+try:
+    import base64, binascii
+except:
+    base64 = binascii = None
+
+# cjkcodecs and iconv_codec provide support for more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+try:
+    import cjkcodecs.aliases
+except:
+    pass
+try:
+    import iconv_codec
+except:
+    pass
+
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+    import chardet
+    if _debug:
+        import chardet.constants
+        chardet.constants._debug = 1
+except:
+    chardet = None
+
+# reversible htmlentitydefs mappings for Python 2.2
+try:
+  from htmlentitydefs import name2codepoint, codepoint2name
+except:
+  import htmlentitydefs
+  name2codepoint={}
+  codepoint2name={}
+  for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
+    if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
+    name2codepoint[name]=ord(codepoint)
+    codepoint2name[ord(codepoint)]=name
+
+# BeautifulSoup parser used for parsing microformats from embedded HTML content
+# http://www.crummy.com/software/BeautifulSoup/
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
+# older 2.x series.  If it doesn't, and you can figure out why, I'll accept a
+# patch and modify the compatibility statement accordingly.
+try:
+    import BeautifulSoup
+except:
+    BeautifulSoup = None
+
+# ---------- don't touch these ----------
+class ThingsNobodyCaresAboutButMe(Exception): pass
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
+class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+class UndeclaredNamespace(Exception): pass
+
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+sgmllib.special = re.compile('<!')
+sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
+
+if sgmllib.endbracket.search(' <').start(0):
+    class EndBracketMatch:
+        endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
+        def search(self,string,index=0):
+            self.match = self.endbracket.match(string,index)
+            if self.match: return self
+        def start(self,n):
+            return self.match.end(n)
+    sgmllib.endbracket = EndBracketMatch()
+
+SUPPORTED_VERSIONS = {'': 'unknown',
+                      'rss090': 'RSS 0.90',
+                      'rss091n': 'RSS 0.91 (Netscape)',
+                      'rss091u': 'RSS 0.91 (Userland)',
+                      'rss092': 'RSS 0.92',
+                      'rss093': 'RSS 0.93',
+                      'rss094': 'RSS 0.94',
+                      'rss20': 'RSS 2.0',
+                      'rss10': 'RSS 1.0',
+                      'rss': 'RSS (unknown version)',
+                      'atom01': 'Atom 0.1',
+                      'atom02': 'Atom 0.2',
+                      'atom03': 'Atom 0.3',
+                      'atom10': 'Atom 1.0',
+                      'atom': 'Atom (unknown version)',
+                      'cdf': 'CDF',
+                      'hotrss': 'Hot RSS'
+                      }
+
+try:
+    UserDict = dict
+except NameError:
+    # Python 2.1 does not have dict
+    from UserDict import UserDict
+    def dict(aList):
+        rc = {}
+        for k, v in aList:
+            rc[k] = v
+        return rc
+
+class FeedParserDict(UserDict):
+    keymap = {'channel': 'feed',
+              'items': 'entries',
+              'guid': 'id',
+              'date': 'updated',
+              'date_parsed': 'updated_parsed',
+              'description': ['subtitle', 'summary'],
+              'url': ['href'],
+              'modified': 'updated',
+              'modified_parsed': 'updated_parsed',
+              'issued': 'published',
+              'issued_parsed': 'published_parsed',
+              'copyright': 'rights',
+              'copyright_detail': 'rights_detail',
+              'tagline': 'subtitle',
+              'tagline_detail': 'subtitle_detail'}
+    def __getitem__(self, key):
+        if key == 'category':
+            return UserDict.__getitem__(self, 'tags')[0]['term']
+        if key == 'enclosures':
+            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
+            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
+        if key == 'license':
+            for link in UserDict.__getitem__(self, 'links'):
+                if link['rel']=='license' and link.has_key('href'):
+                    return link['href']
+        if key == 'categories':
+            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
+        realkey = self.keymap.get(key, key)
+        if type(realkey) == types.ListType:
+            for k in realkey:
+                if UserDict.has_key(self, k):
+                    return UserDict.__getitem__(self, k)
+        if UserDict.has_key(self, key):
+            return UserDict.__getitem__(self, key)
+        return UserDict.__getitem__(self, realkey)
+
+    def __setitem__(self, key, value):
+        for k in self.keymap.keys():
+            if key == k:
+                key = self.keymap[k]
+                if type(key) == types.ListType:
+                    key = key[0]
+        return UserDict.__setitem__(self, key, value)
+
+    def get(self, key, default=None):
+        if self.has_key(key):
+            return self[key]
+        else:
+            return default
+
+    def setdefault(self, key, value):
+        if not self.has_key(key):
+            self[key] = value
+        return self[key]
+
+    def has_key(self, key):
+        try:
+            return hasattr(self, key) or UserDict.has_key(self, key)
+        except AttributeError:
+            return False
+
+    def __getattr__(self, key):
+        try:
+            return self.__dict__[key]
+        except KeyError:
+            pass
+        try:
+            assert not key.startswith('_')
+            return self.__getitem__(key)
+        except:
+            raise AttributeError, "object has no attribute '%s'" % key
+
+    def __setattr__(self, key, value):
+        if key.startswith('_') or key == 'data':
+            self.__dict__[key] = value
+        else:
+            return self.__setitem__(key, value)
+
+    def __contains__(self, key):
+        return self.has_key(key)
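+
+# A quick sketch of the aliasing this class provides (illustrative only):
+#
+#   d = FeedParserDict()
+#   d['channel'] = 'x'     # __setitem__ maps 'channel' to 'feed'
+#   assert d['feed'] == 'x'
+#   assert d['channel'] == 'x'
+#   assert d.feed == 'x'   # attribute access falls through to keys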
+
+def zopeCompatibilityHack():
+    global FeedParserDict
+    del FeedParserDict
+    def FeedParserDict(aDict=None):
+        rc = {}
+        if aDict:
+            rc.update(aDict)
+        return rc
+
+_ebcdic_to_ascii_map = None
+def _ebcdic_to_ascii(s):
+    global _ebcdic_to_ascii_map
+    if not _ebcdic_to_ascii_map:
+        emap = (
+            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
+            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
+            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
+            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
+            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
+            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
+            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+            )
+        import string
+        _ebcdic_to_ascii_map = string.maketrans( \
+            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+    return s.translate(_ebcdic_to_ascii_map)
+
+_cp1252 = {
+  unichr(128): unichr(8364), # euro sign
+  unichr(130): unichr(8218), # single low-9 quotation mark
+  unichr(131): unichr( 402), # latin small letter f with hook
+  unichr(132): unichr(8222), # double low-9 quotation mark
+  unichr(133): unichr(8230), # horizontal ellipsis
+  unichr(134): unichr(8224), # dagger
+  unichr(135): unichr(8225), # double dagger
+  unichr(136): unichr( 710), # modifier letter circumflex accent
+  unichr(137): unichr(8240), # per mille sign
+  unichr(138): unichr( 352), # latin capital letter s with caron
+  unichr(139): unichr(8249), # single left-pointing angle quotation mark
+  unichr(140): unichr( 338), # latin capital ligature oe
+  unichr(142): unichr( 381), # latin capital letter z with caron
+  unichr(145): unichr(8216), # left single quotation mark
+  unichr(146): unichr(8217), # right single quotation mark
+  unichr(147): unichr(8220), # left double quotation mark
+  unichr(148): unichr(8221), # right double quotation mark
+  unichr(149): unichr(8226), # bullet
+  unichr(150): unichr(8211), # en dash
+  unichr(151): unichr(8212), # em dash
+  unichr(152): unichr( 732), # small tilde
+  unichr(153): unichr(8482), # trade mark sign
+  unichr(154): unichr( 353), # latin small letter s with caron
+  unichr(155): unichr(8250), # single right-pointing angle quotation mark
+  unichr(156): unichr( 339), # latin small ligature oe
+  unichr(158): unichr( 382), # latin small letter z with caron
+  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
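+
+# Sketch of how this table is applied (see the end of pop() below):
+#   u''.join([_cp1252.get(c, c) for c in u'\x93quoted\x94'])
+#   # -> u'\u201cquoted\u201d' (curly quotes instead of raw cp1252 bytes)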
+
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+def _urljoin(base, uri):
+    uri = _urifixer.sub(r'\1\3', uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
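+
+# For example (values illustrative):
+#   _urljoin('http://example.com/feeds/', 'atom.xml')
+#   # -> 'http://example.com/feeds/atom.xml'
+# The _urifixer pass first collapses stray slashes right after the scheme,
+# so a malformed base like 'http:////example.com/' joins cleanly too.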
+
+class _FeedParserMixin:
+    namespaces = {'': '',
+                  'http://backend.userland.com/rss': '',
+                  'http://blogs.law.harvard.edu/tech/rss': '',
+                  'http://purl.org/rss/1.0/': '',
+                  'http://my.netscape.com/rdf/simple/0.9/': '',
+                  'http://example.com/newformat#': '',
+                  'http://example.com/necho': '',
+                  'http://purl.org/echo/': '',
+                  'uri/of/echo/namespace#': '',
+                  'http://purl.org/pie/': '',
+                  'http://purl.org/atom/ns#': '',
+                  'http://www.w3.org/2005/Atom': '',
+                  'http://purl.org/rss/1.0/modules/rss091#': '',
+
+                  'http://webns.net/mvcb/':                               'admin',
+                  'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
+                  'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
+                  'http://media.tangent.org/rss/1.0/':                    'audio',
+                  'http://backend.userland.com/blogChannelModule':        'blogChannel',
+                  'http://web.resource.org/cc/':                          'cc',
+                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
+                  'http://purl.org/rss/1.0/modules/company':              'co',
+                  'http://purl.org/rss/1.0/modules/content/':             'content',
+                  'http://my.theinfo.org/changed/1.0/rss/':               'cp',
+                  'http://purl.org/dc/elements/1.1/':                     'dc',
+                  'http://purl.org/dc/terms/':                            'dcterms',
+                  'http://purl.org/rss/1.0/modules/email/':               'email',
+                  'http://purl.org/rss/1.0/modules/event/':               'ev',
+                  'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
+                  'http://freshmeat.net/rss/fm/':                         'fm',
+                  'http://xmlns.com/foaf/0.1/':                           'foaf',
+                  'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
+                  'http://postneo.com/icbm/':                             'icbm',
+                  'http://purl.org/rss/1.0/modules/image/':               'image',
+                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
+                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
+                  'http://purl.org/rss/1.0/modules/link/':                'l',
+                  'http://search.yahoo.com/mrss':                         'media',
+                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
+                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
+                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
+                  'http://purl.org/rss/1.0/modules/reference/':           'ref',
+                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
+                  'http://purl.org/rss/1.0/modules/search/':              'search',
+                  'http://purl.org/rss/1.0/modules/slash/':               'slash',
+                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
+                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
+                  'http://hacks.benhammersley.com/rss/streaming/':        'str',
+                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',
+                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',
+                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf',
+                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
+                  'http://purl.org/rss/1.0/modules/threading/':           'thr',
+                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',
+                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
+                  'http://wellformedweb.org/commentAPI/':                 'wfw',
+                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
+                  'http://www.w3.org/1999/xhtml':                         'xhtml',
+                  'http://www.w3.org/1999/xlink':                         'xlink',
+                  'http://www.w3.org/XML/1998/namespace':                 'xml'
+}
+    _matchnamespaces = {}
+
+    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    html_types = ['text/html', 'application/xhtml+xml']
+
+    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
+        if _debug: sys.stderr.write('initializing FeedParser\n')
+        if not self._matchnamespaces:
+            for k, v in self.namespaces.items():
+                self._matchnamespaces[k.lower()] = v
+        self.feeddata = FeedParserDict() # feed-level data
+        self.encoding = encoding # character encoding
+        self.entries = [] # list of entry-level data
+        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
+        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
+
+        # the following are used internally to track state;
+        # this is really out of control and should be refactored
+        self.infeed = 0
+        self.inentry = 0
+        self.incontent = 0
+        self.intextinput = 0
+        self.inimage = 0
+        self.inauthor = 0
+        self.incontributor = 0
+        self.inpublisher = 0
+        self.insource = 0
+        self.sourcedata = FeedParserDict()
+        self.contentparams = FeedParserDict()
+        self._summaryKey = None
+        self.namespacemap = {}
+        self.elementstack = []
+        self.basestack = []
+        self.langstack = []
+        self.baseuri = baseuri or ''
+        self.lang = baselang or None
+        self.svgOK = 0
+        self.hasTitle = 0
+        if baselang:
+            self.feeddata['language'] = baselang.replace('_','-')
+
+    def unknown_starttag(self, tag, attrs):
+        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
+        # normalize attrs
+        attrs = [(k.lower(), v) for k, v in attrs]
+        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+
+        # track xml:base and xml:lang
+        attrsD = dict(attrs)
+        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+        if type(baseuri) != type(u''):
+            try:
+                baseuri = unicode(baseuri, self.encoding)
+            except:
+                baseuri = unicode(baseuri, 'iso-8859-1')
+        self.baseuri = _urljoin(self.baseuri, baseuri)
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
+        if lang == '':
+            # xml:lang could be explicitly set to '', we need to capture that
+            lang = None
+        elif lang is None:
+            # if no xml:lang is specified, use parent lang
+            lang = self.lang
+        if lang:
+            if tag in ('feed', 'rss', 'rdf:RDF'):
+                self.feeddata['language'] = lang.replace('_','-')
+        self.lang = lang
+        self.basestack.append(self.baseuri)
+        self.langstack.append(lang)
+
+        # track namespaces
+        for prefix, uri in attrs:
+            if prefix.startswith('xmlns:'):
+                self.trackNamespace(prefix[6:], uri)
+            elif prefix == 'xmlns':
+                self.trackNamespace(None, uri)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+            # element declared itself as escaped markup, but it isn't really
+            self.contentparams['type'] = 'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+            if tag.find(':') <> -1:
+                prefix, tag = tag.split(':', 1)
+                namespace = self.namespacesInUse.get(prefix, '')
+                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+                    attrs.append(('xmlns',namespace))
+                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
+                    attrs.append(('xmlns',namespace))
+            if tag == 'svg': self.svgOK += 1
+            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+
+        # special hack for better tracking of empty textinput/image elements in illformed feeds
+        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
+            self.intextinput = 0
+        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
+            self.inimage = 0
+
+        # call special handler (if defined) or default handler
+        methodname = '_start_' + prefix + suffix
+        try:
+            method = getattr(self, methodname)
+            return method(attrsD)
+        except AttributeError:
+            return self.push(prefix + suffix, 1)
+
+    def unknown_endtag(self, tag):
+        if _debug: sys.stderr.write('end %s\n' % tag)
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+        if suffix == 'svg' and self.svgOK: self.svgOK -= 1
+
+        # call special handler (if defined) or default handler
+        methodname = '_end_' + prefix + suffix
+        try:
+            if self.svgOK: raise AttributeError()
+            method = getattr(self, methodname)
+            method()
+        except AttributeError:
+            self.pop(prefix + suffix)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            # element declared itself as escaped markup, but it isn't really
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+            self.contentparams['type'] = 'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+            tag = tag.split(':')[-1]
+            self.handle_data('</%s>' % tag, escape=0)
+
+        # track xml:base and xml:lang going out of scope
+        if self.basestack:
+            self.basestack.pop()
+            if self.basestack and self.basestack[-1]:
+                self.baseuri = self.basestack[-1]
+        if self.langstack:
+            self.langstack.pop()
+            if self.langstack: # and (self.langstack[-1] is not None):
+                self.lang = self.langstack[-1]
+
+    def handle_charref(self, ref):
+        # called for each character reference, e.g. for '&#160;', ref will be '160'
+        if not self.elementstack: return
+        ref = ref.lower()
+        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
+            text = '&#%s;' % ref
+        else:
+            if ref[0] == 'x':
+                c = int(ref[1:], 16)
+            else:
+                c = int(ref)
+            text = unichr(c).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_entityref(self, ref):
+        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
+        if not self.elementstack: return
+        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
+        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+            text = '&%s;' % ref
+        elif ref in self.entities.keys():
+            text = self.entities[ref]
+            if text.startswith('&#') and text.endswith(';'):
+                return self.handle_entityref(text)
+        else:
+            try: name2codepoint[ref]
+            except KeyError: text = '&%s;' % ref
+            else: text = unichr(name2codepoint[ref]).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_data(self, text, escape=1):
+        # called for each block of plain text, i.e. outside of any tag and
+        # not containing any character or entity references
+        if not self.elementstack: return
+        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
+            text = _xmlescape(text)
+        self.elementstack[-1][2].append(text)
+
+    def handle_comment(self, text):
+        # called for each comment, e.g. <!-- insert message here -->
+        pass
+
+    def handle_pi(self, text):
+        # called for each processing instruction, e.g. <?instruction>
+        pass
+
+    def handle_decl(self, text):
+        pass
+
+    def parse_declaration(self, i):
+        # override internal declaration handler to handle CDATA blocks
+        if _debug: sys.stderr.write('entering parse_declaration\n')
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1: k = len(self.rawdata)
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+            return k+3
+        else:
+            k = self.rawdata.find('>', i)
+            return k+1
+
+    def mapContentType(self, contentType):
+        contentType = contentType.lower()
+        if contentType == 'text':
+            contentType = 'text/plain'
+        elif contentType == 'html':
+            contentType = 'text/html'
+        elif contentType == 'xhtml':
+            contentType = 'application/xhtml+xml'
+        return contentType
+
+    def trackNamespace(self, prefix, uri):
+        loweruri = uri.lower()
+        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
+            self.version = 'rss090'
+        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
+            self.version = 'rss10'
+        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+            self.version = 'atom10'
+        if loweruri.find('backend.userland.com/rss') <> -1:
+            # match any backend.userland.com namespace
+            uri = 'http://backend.userland.com/rss'
+            loweruri = uri
+        if self._matchnamespaces.has_key(loweruri):
+            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+        else:
+            self.namespacesInUse[prefix or ''] = uri
+
+    def resolveURI(self, uri):
+        return _urljoin(self.baseuri or '', uri)
+
+    def decodeEntities(self, element, data):
+        return data
+
+    def strattrs(self, attrs):
+        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
+
+    def push(self, element, expectingText):
+        self.elementstack.append([element, expectingText, []])
+
+    def pop(self, element, stripWhitespace=1):
+        if not self.elementstack: return
+        if self.elementstack[-1][0] != element: return
+
+        element, expectingText, pieces = self.elementstack.pop()
+
+        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
+            # remove enclosing child element, but only if it is a <div> and
+            # only if all the remaining content is nested underneath it.
+            # This means that the divs would be retained in the following:
+            #    <div>foo</div><div>bar</div>
+            while pieces and len(pieces)>1 and not pieces[-1].strip():
+                del pieces[-1]
+            while pieces and len(pieces)>1 and not pieces[0].strip():
+                del pieces[0]
+            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
+                depth = 0
+                for piece in pieces[:-1]:
+                    if piece.startswith('</'):
+                        depth -= 1
+                        if depth == 0: break
+                    elif piece.startswith('<') and not piece.endswith('/>'):
+                        depth += 1
+                else:
+                    pieces = pieces[1:-1]
+
+        output = ''.join(pieces)
+        if stripWhitespace:
+            output = output.strip()
+        if not expectingText: return output
+
+        # decode base64 content
+        if base64 and self.contentparams.get('base64', 0):
+            try:
+                output = base64.decodestring(output)
+            except binascii.Error:
+                pass
+            except binascii.Incomplete:
+                pass
+
+        # resolve relative URIs
+        if (element in self.can_be_relative_uri) and output:
+            output = self.resolveURI(output)
+
+        # decode entities within embedded markup
+        if not self.contentparams.get('base64', 0):
+            output = self.decodeEntities(element, output)
+
+        if self.lookslikehtml(output):
+            self.contentparams['type']='text/html'
+
+        # remove temporary cruft from contentparams
+        try:
+            del self.contentparams['mode']
+        except KeyError:
+            pass
+        try:
+            del self.contentparams['base64']
+        except KeyError:
+            pass
+
+        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
+        # resolve relative URIs within embedded markup
+        if is_htmlish and RESOLVE_RELATIVE_URIS:
+            if element in self.can_contain_relative_uris:
+                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
+
+        # parse microformats
+        # (must do this before sanitizing because some microformats
+        # rely on elements that we sanitize)
+        if is_htmlish and element in ['content', 'description', 'summary']:
+            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
+            if mfresults:
+                for tag in mfresults.get('tags', []):
+                    self._addTag(tag['term'], tag['scheme'], tag['label'])
+                for enclosure in mfresults.get('enclosures', []):
+                    self._start_enclosure(enclosure)
+                for xfn in mfresults.get('xfn', []):
+                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
+                vcard = mfresults.get('vcard')
+                if vcard:
+                    self._getContext()['vcard'] = vcard
+
+        # sanitize embedded markup
+        if is_htmlish and SANITIZE_HTML:
+            if element in self.can_contain_dangerous_markup:
+                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
+
+        if self.encoding and type(output) != type(u''):
+            try:
+                output = unicode(output, self.encoding)
+            except:
+                pass
+
+        # address common error where people take data that is already
+        # utf-8, presume that it is iso-8859-1, and re-encode it.
+        if self.encoding=='utf-8' and type(output) == type(u''):
+            try:
+                output = unicode(output.encode('iso-8859-1'), 'utf-8')
+            except:
+                pass
+
+        # map win-1252 extensions to the proper code points
+        if type(output) == type(u''):
+            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
+
+        # categories/tags/keywords/whatever are handled in _end_category
+        if element == 'category':
+            return output
+
+        if element == 'title' and self.hasTitle:
+            return output
+
+        # store output in appropriate place(s)
+        if self.inentry and not self.insource:
+            if element == 'content':
+                self.entries[-1].setdefault(element, [])
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                self.entries[-1][element].append(contentparams)
+            elif element == 'link':
+                self.entries[-1][element] = output
+                if output:
+                    self.entries[-1]['links'][-1]['href'] = output
+            else:
+                if element == 'description':
+                    element = 'summary'
+                self.entries[-1][element] = output
+                if self.incontent:
+                    contentparams = copy.deepcopy(self.contentparams)
+                    contentparams['value'] = output
+                    self.entries[-1][element + '_detail'] = contentparams
+        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
+            context = self._getContext()
+            if element == 'description':
+                element = 'subtitle'
+            context[element] = output
+            if element == 'link':
+                context['links'][-1]['href'] = output
+            elif self.incontent:
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                context[element + '_detail'] = contentparams
+        return output
+
+    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
+        self.incontent += 1
+        if self.lang: self.lang=self.lang.replace('_','-')
+        self.contentparams = FeedParserDict({
+            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
+            'language': self.lang,
+            'base': self.baseuri})
+        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
+        self.push(tag, expectingText)
+
+    def popContent(self, tag):
+        value = self.pop(tag)
+        self.incontent -= 1
+        self.contentparams.clear()
+        return value
+
+    # a number of elements in a number of RSS variants are nominally plain
+    # text, but this is routinely ignored.  This is an attempt to detect
+    # the most common cases.  As false positives often result in silent
+    # data loss, this function errs on the conservative side.
+    def lookslikehtml(self, str):
+        if self.version.startswith('atom'): return
+        if self.contentparams.get('type','text/html') != 'text/plain': return
+
+        # must have a close tag or an entity reference to qualify
+        if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return
+
+        # all tags must be in a restricted subset of valid HTML tags
+        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
+            re.findall(r'</?(\w+)',str)): return
+
+        # all entities must have been defined as valid HTML entities
+        from htmlentitydefs import entitydefs
+        if filter(lambda e: e not in entitydefs.keys(),
+            re.findall(r'&(\w+);',str)): return
+
+        return 1
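+
+    # Illustrative behaviour (assuming a non-Atom feed whose contentparams
+    # type is 'text/plain'):
+    #   lookslikehtml('a <b>bold</b> claim &amp; more')  ->  1
+    #   lookslikehtml('plain text, no markup')           ->  None (falls through)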
+
+    def _mapToStandardPrefix(self, name):
+        colonpos = name.find(':')
+        if colonpos <> -1:
+            prefix = name[:colonpos]
+            suffix = name[colonpos+1:]
+            prefix = self.namespacemap.get(prefix, prefix)
+            name = prefix + ':' + suffix
+        return name
+
+    def _getAttribute(self, attrsD, name):
+        return attrsD.get(self._mapToStandardPrefix(name))
+
+    def _isBase64(self, attrsD, contentparams):
+        if attrsD.get('mode', '') == 'base64':
+            return 1
+        if self.contentparams['type'].startswith('text/'):
+            return 0
+        if self.contentparams['type'].endswith('+xml'):
+            return 0
+        if self.contentparams['type'].endswith('/xml'):
+            return 0
+        return 1
+
+    def _itsAnHrefDamnIt(self, attrsD):
+        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
+        if href:
+            try:
+                del attrsD['url']
+            except KeyError:
+                pass
+            try:
+                del attrsD['uri']
+            except KeyError:
+                pass
+            attrsD['href'] = href
+        return attrsD
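+
+    # e.g. (illustrative): {'url': 'http://x/'} becomes {'href': 'http://x/'}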
+
+    def _save(self, key, value):
+        context = self._getContext()
+        context.setdefault(key, value)
+
+    def _start_rss(self, attrsD):
+        versionmap = {'0.91': 'rss091u',
+                      '0.92': 'rss092',
+                      '0.93': 'rss093',
+                      '0.94': 'rss094'}
+        if not self.version:
+            attr_version = attrsD.get('version', '')
+            version = versionmap.get(attr_version)
+            if version:
+                self.version = version
+            elif attr_version.startswith('2.'):
+                self.version = 'rss20'
+            else:
+                self.version = 'rss'
+
+    def _start_dlhottitles(self, attrsD):
+        self.version = 'hotrss'
+
+    def _start_channel(self, attrsD):
+        self.infeed = 1
+        self._cdf_common(attrsD)
+    _start_feedinfo = _start_channel
+
+    def _cdf_common(self, attrsD):
+        if attrsD.has_key('lastmod'):
+            self._start_modified({})
+            self.elementstack[-1][-1] = attrsD['lastmod']
+            self._end_modified()
+        if attrsD.has_key('href'):
+            self._start_link({})
+            self.elementstack[-1][-1] = attrsD['href']
+            self._end_link()
+
+    def _start_feed(self, attrsD):
+        self.infeed = 1
+        versionmap = {'0.1': 'atom01',
+                      '0.2': 'atom02',
+                      '0.3': 'atom03'}
+        if not self.version:
+            attr_version = attrsD.get('version')
+            version = versionmap.get(attr_version)
+            if version:
+                self.version = version
+            else:
+                self.version = 'atom'
+
+    def _end_channel(self):
+        self.infeed = 0
+    _end_feed = _end_channel
+
+    def _start_image(self, attrsD):
+        context = self._getContext()
+        context.setdefault('image', FeedParserDict())
+        self.inimage = 1
+        self.hasTitle = 0
+        self.push('image', 0)
+
+    def _end_image(self):
+        self.pop('image')
+        self.inimage = 0
+
+    def _start_textinput(self, attrsD):
+        context = self._getContext()
+        context.setdefault('textinput', FeedParserDict())
+        self.intextinput = 1
+        self.hasTitle = 0
+        self.push('textinput', 0)
+    _start_textInput = _start_textinput
+
+    def _end_textinput(self):
+        self.pop('textinput')
+        self.intextinput = 0
+    _end_textInput = _end_textinput
+
+    def _start_author(self, attrsD):
+        self.inauthor = 1
+        self.push('author', 1)
+    _start_managingeditor = _start_author
+    _start_dc_author = _start_author
+    _start_dc_creator = _start_author
+    _start_itunes_author = _start_author
+
+    def _end_author(self):
+        self.pop('author')
+        self.inauthor = 0
+        self._sync_author_detail()
+    _end_managingeditor = _end_author
+    _end_dc_author = _end_author
+    _end_dc_creator = _end_author
+    _end_itunes_author = _end_author
+
+    def _start_itunes_owner(self, attrsD):
+        self.inpublisher = 1
+        self.push('publisher', 0)
+
+    def _end_itunes_owner(self):
+        self.pop('publisher')
+        self.inpublisher = 0
+        self._sync_author_detail('publisher')
+
+    def _start_contributor(self, attrsD):
+        self.incontributor = 1
+        context = self._getContext()
+        context.setdefault('contributors', [])
+        context['contributors'].append(FeedParserDict())
+        self.push('contributor', 0)
+
+    def _end_contributor(self):
+        self.pop('contributor')
+        self.incontributor = 0
+
+    def _start_dc_contributor(self, attrsD):
+        self.incontributor = 1
+        context = self._getContext()
+        context.setdefault('contributors', [])
+        context['contributors'].append(FeedParserDict())
+        self.push('name', 0)
+
+    def _end_dc_contributor(self):
+        self._end_name()
+        self.incontributor = 0
+
+    def _start_name(self, attrsD):
+        self.push('name', 0)
+    _start_itunes_name = _start_name
+
+    def _end_name(self):
+        value = self.pop('name')
+        if self.inpublisher:
+            self._save_author('name', value, 'publisher')
+        elif self.inauthor:
+            self._save_author('name', value)
+        elif self.incontributor:
+            self._save_contributor('name', value)
+        elif self.intextinput:
+            context = self._getContext()
+            context['name'] = value
+    _end_itunes_name = _end_name
+
+    def _start_width(self, attrsD):
+        self.push('width', 0)
+
+    def _end_width(self):
+        value = self.pop('width')
+        try:
+            value = int(value)
+        except:
+            value = 0
+        if self.inimage:
+            context = self._getContext()
+            context['width'] = value
+
+    def _start_height(self, attrsD):
+        self.push('height', 0)
+
+    def _end_height(self):
+        value = self.pop('height')
+        try:
+            value = int(value)
+        except:
+            value = 0
+        if self.inimage:
+            context = self._getContext()
+            context['height'] = value
+
+    def _start_url(self, attrsD):
+        self.push('href', 1)
+    _start_homepage = _start_url
+    _start_uri = _start_url
+
+    def _end_url(self):
+        value = self.pop('href')
+        if self.inauthor:
+            self._save_author('href', value)
+        elif self.incontributor:
+            self._save_contributor('href', value)
+    _end_homepage = _end_url
+    _end_uri = _end_url
+
+    def _start_email(self, attrsD):
+        self.push('email', 0)
+    _start_itunes_email = _start_email
+
+    def _end_email(self):
+        value = self.pop('email')
+        if self.inpublisher:
+            self._save_author('email', value, 'publisher')
+        elif self.inauthor:
+            self._save_author('email', value)
+        elif self.incontributor:
+            self._save_contributor('email', value)
+    _end_itunes_email = _end_email
+
+    def _getContext(self):
+        if self.insource:
+            context = self.sourcedata
+        elif self.inimage:
+            context = self.feeddata['image']
+        elif self.intextinput:
+            context = self.feeddata['textinput']
+        elif self.inentry:
+            context = self.entries[-1]
+        else:
+            context = self.feeddata
+        return context
+
+    def _save_author(self, key, value, prefix='author'):
+        context = self._getContext()
+        context.setdefault(prefix + '_detail', FeedParserDict())
+        context[prefix + '_detail'][key] = value
+        self._sync_author_detail()
+
+    def _save_contributor(self, key, value):
+        context = self._getContext()
+        context.setdefault('contributors', [FeedParserDict()])
+        context['contributors'][-1][key] = value
+
+    def _sync_author_detail(self, key='author'):
+        context = self._getContext()
+        detail = context.get('%s_detail' % key)
+        if detail:
+            name = detail.get('name')
+            email = detail.get('email')
+            if name and email:
+                context[key] = '%s (%s)' % (name, email)
+            elif name:
+                context[key] = name
+            elif email:
+                context[key] = email
+        else:
+            author, email = context.get(key), None
+            if not author: return
+            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
+            if emailmatch:
+                email = emailmatch.group(0)
+                # probably a better way to do the following, but it passes all the tests
+                author = author.replace(email, '')
+                author = author.replace('()', '')
+                author = author.replace('<>', '')
+                author = author.replace('&lt;&gt;', '')
+                author = author.strip()
+                if author and (author[0] == '('):
+                    author = author[1:]
+                if author and (author[-1] == ')'):
+                    author = author[:-1]
+                author = author.strip()
+            if author or email:
+                context.setdefault('%s_detail' % key, FeedParserDict())
+            if author:
+                context['%s_detail' % key]['name'] = author
+            if email:
+                context['%s_detail' % key]['email'] = email
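+
+    # Illustrative round trips (values hypothetical):
+    #   context['author_detail'] = {'name': 'Jane Roe', 'email': 'jane@example.com'}
+    #     -> context['author'] = 'Jane Roe (jane@example.com)'
+    #   context['author'] = 'Jane Roe (jane@example.com)' with no detail dict
+    #     -> author_detail is rebuilt with the name and email split back out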
1169
+
1170
+    def _start_subtitle(self, attrsD):
1171
+        self.pushContent('subtitle', attrsD, 'text/plain', 1)
1172
+    _start_tagline = _start_subtitle
1173
+    _start_itunes_subtitle = _start_subtitle
1174
+
1175
+    def _end_subtitle(self):
1176
+        self.popContent('subtitle')
1177
+    _end_tagline = _end_subtitle
1178
+    _end_itunes_subtitle = _end_subtitle
+
+    def _start_rights(self, attrsD):
+        self.pushContent('rights', attrsD, 'text/plain', 1)
+    _start_dc_rights = _start_rights
+    _start_copyright = _start_rights
+
+    def _end_rights(self):
+        self.popContent('rights')
+    _end_dc_rights = _end_rights
+    _end_copyright = _end_rights
+
+    def _start_item(self, attrsD):
+        self.entries.append(FeedParserDict())
+        self.push('item', 0)
+        self.inentry = 1
+        self.guidislink = 0
+        self.hasTitle = 0
+        id = self._getAttribute(attrsD, 'rdf:about')
+        if id:
+            context = self._getContext()
+            context['id'] = id
+        self._cdf_common(attrsD)
+    _start_entry = _start_item
+    _start_product = _start_item
+
+    def _end_item(self):
+        self.pop('item')
+        self.inentry = 0
+    _end_entry = _end_item
+
+    def _start_dc_language(self, attrsD):
+        self.push('language', 1)
+    _start_language = _start_dc_language
+
+    def _end_dc_language(self):
+        self.lang = self.pop('language')
+    _end_language = _end_dc_language
+
+    def _start_dc_publisher(self, attrsD):
+        self.push('publisher', 1)
+    _start_webmaster = _start_dc_publisher
+
+    def _end_dc_publisher(self):
+        self.pop('publisher')
+        self._sync_author_detail('publisher')
+    _end_webmaster = _end_dc_publisher
+
+    def _start_published(self, attrsD):
+        self.push('published', 1)
+    _start_dcterms_issued = _start_published
+    _start_issued = _start_published
+
+    def _end_published(self):
+        value = self.pop('published')
+        self._save('published_parsed', _parse_date(value))
+    _end_dcterms_issued = _end_published
+    _end_issued = _end_published
+
+    def _start_updated(self, attrsD):
+        self.push('updated', 1)
+    _start_modified = _start_updated
+    _start_dcterms_modified = _start_updated
+    _start_pubdate = _start_updated
+    _start_dc_date = _start_updated
+
+    def _end_updated(self):
+        value = self.pop('updated')
+        parsed_value = _parse_date(value)
+        self._save('updated_parsed', parsed_value)
+    _end_modified = _end_updated
+    _end_dcterms_modified = _end_updated
+    _end_pubdate = _end_updated
+    _end_dc_date = _end_updated
+
+    def _start_created(self, attrsD):
+        self.push('created', 1)
+    _start_dcterms_created = _start_created
+
+    def _end_created(self):
+        value = self.pop('created')
+        self._save('created_parsed', _parse_date(value))
+    _end_dcterms_created = _end_created
+
+    def _start_expirationdate(self, attrsD):
+        self.push('expired', 1)
+
+    def _end_expirationdate(self):
+        self._save('expired_parsed', _parse_date(self.pop('expired')))
+
+    def _start_cc_license(self, attrsD):
+        context = self._getContext()
+        value = self._getAttribute(attrsD, 'rdf:resource')
+        attrsD = FeedParserDict()
+        attrsD['rel']='license'
+        if value: attrsD['href']=value
+        context.setdefault('links', []).append(attrsD)
+
+    def _start_creativecommons_license(self, attrsD):
+        self.push('license', 1)
+    _start_creativeCommons_license = _start_creativecommons_license
+
+    def _end_creativecommons_license(self):
+        value = self.pop('license')
+        context = self._getContext()
+        attrsD = FeedParserDict()
+        attrsD['rel']='license'
+        if value: attrsD['href']=value
+        context.setdefault('links', []).append(attrsD)
+        del context['license']
+    _end_creativeCommons_license = _end_creativecommons_license
+
+    def _addXFN(self, relationships, href, name):
+        context = self._getContext()
+        xfn = context.setdefault('xfn', [])
+        value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
+        if value not in xfn:
+            xfn.append(value)
+
+    def _addTag(self, term, scheme, label):
+        context = self._getContext()
+        tags = context.setdefault('tags', [])
+        if (not term) and (not scheme) and (not label): return
+        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
+        if value not in tags:
+            tags.append(value)
+
+    def _start_category(self, attrsD):
+        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
+        term = attrsD.get('term')
+        scheme = attrsD.get('scheme', attrsD.get('domain'))
+        label = attrsD.get('label')
+        self._addTag(term, scheme, label)
+        self.push('category', 1)
+    _start_dc_subject = _start_category
+    _start_keywords = _start_category
+
+    def _end_itunes_keywords(self):
+        for term in self.pop('itunes_keywords').split():
+            self._addTag(term, 'http://www.itunes.com/', None)
+
+    def _start_itunes_category(self, attrsD):
+        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
+        self.push('category', 1)
+
+    def _end_category(self):
+        value = self.pop('category')
+        if not value: return
+        context = self._getContext()
+        tags = context['tags']
+        if value and len(tags) and not tags[-1]['term']:
+            tags[-1]['term'] = value
+        else:
+            self._addTag(value, None, None)
+    _end_dc_subject = _end_category
+    _end_keywords = _end_category
+    _end_itunes_category = _end_category
+
+    def _start_cloud(self, attrsD):
+        self._getContext()['cloud'] = FeedParserDict(attrsD)
+
+    def _start_link(self, attrsD):
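+        # a link with no rel is treated as rel='alternate'; rel='self' links
+        # default to the Atom media type, everything else to text/html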
+        attrsD.setdefault('rel', 'alternate')
+        if attrsD['rel'] == 'self':
+            attrsD.setdefault('type', 'application/atom+xml')
+        else:
+            attrsD.setdefault('type', 'text/html')
+        context = self._getContext()
+        attrsD = self._itsAnHrefDamnIt(attrsD)
+        if attrsD.has_key('href'):
+            attrsD['href'] = self.resolveURI(attrsD['href'])
+            if attrsD.get('rel')=='enclosure' and not context.get('id'):
+                context['id'] = attrsD.get('href')
+        expectingText = self.infeed or self.inentry or self.insource
+        context.setdefault('links', [])
+        context['links'].append(FeedParserDict(attrsD))
+        if attrsD.has_key('href'):
+            expectingText = 0
+            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
+                context['link'] = attrsD['href']
+        else:
+            self.push('link', expectingText)
+    _start_producturl = _start_link
+
+    def _end_link(self):
+        value = self.pop('link')
+        context = self._getContext()
+    _end_producturl = _end_link
+
+    def _start_guid(self, attrsD):
+        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
+        self.push('id', 1)
+
+    def _end_guid(self):
+        value = self.pop('id')
+        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
+        if self.guidislink:
+            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
+            # and only if the item doesn't already have a link element
+            self._save('link', value)
+
+    def _start_title(self, attrsD):
+        if self.svgOK: return self.unknown_starttag('title', attrsD.items())
+        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
+    _start_dc_title = _start_title
+    _start_media_title = _start_title
+
+    def _end_title(self):
+        if self.svgOK: return
+        value = self.popContent('title')
+        if not value: return
+        context = self._getContext()
+        self.hasTitle = 1
+    _end_dc_title = _end_title
+
+    def _end_media_title(self):
+        hasTitle = self.hasTitle
+        self._end_title()
+        self.hasTitle = hasTitle
+
+    def _start_description(self, attrsD):
+        context = self._getContext()
+        if context.has_key('summary'):
+            self._summaryKey = 'content'
+            self._start_content(attrsD)
+        else:
+            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
+    _start_dc_description = _start_description
+
+    def _start_abstract(self, attrsD):
+        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
+
+    def _end_description(self):
+        if self._summaryKey == 'content':
+            self._end_content()
+        else:
+            value = self.popContent('description')
+        self._summaryKey = None
+    _end_abstract = _end_description
+    _end_dc_description = _end_description
+
+    def _start_info(self, attrsD):
+        self.pushContent('info', attrsD, 'text/plain', 1)
+    _start_feedburner_browserfriendly = _start_info
+
+    def _end_info(self):
+        self.popContent('info')
+    _end_feedburner_browserfriendly = _end_info
+
+    def _start_generator(self, attrsD):
+        if attrsD:
+            attrsD = self._itsAnHrefDamnIt(attrsD)
+            if attrsD.has_key('href'):
+                attrsD['href'] = self.resolveURI(attrsD['href'])
+        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
+        self.push('generator', 1)
+
+    def _end_generator(self):
+        value = self.pop('generator')
+        context = self._getContext()
+        if context.has_key('generator_detail'):
+            context['generator_detail']['name'] = value
+
+    def _start_admin_generatoragent(self, attrsD):
+        self.push('generator', 1)
+        value = self._getAttribute(attrsD, 'rdf:resource')
+        if value:
+            self.elementstack[-1][2].append(value)
+        self.pop('generator')
+        self._getContext()['generator_detail'] = FeedParserDict({'href': value})
+
+    def _start_admin_errorreportsto(self, attrsD):
+        self.push('errorreportsto', 1)
+        value = self._getAttribute(attrsD, 'rdf:resource')
+        if value:
+            self.elementstack[-1][2].append(value)
+        self.pop('errorreportsto')
+
+    def _start_summary(self, attrsD):
+        context = self._getContext()
+        if context.has_key('summary'):
+            self._summaryKey = 'content'
+            self._start_content(attrsD)
+        else:
+            self._summaryKey = 'summary'
+            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
+    _start_itunes_summary = _start_summary
+
+    def _end_summary(self):
+        if self._summaryKey == 'content':
+            self._end_content()
+        else:
+            self.popContent(self._summaryKey or 'summary')
+        self._summaryKey = None
+    _end_itunes_summary = _end_summary
+
+    def _start_enclosure(self, attrsD):
+        attrsD = self._itsAnHrefDamnIt(attrsD)
+        context = self._getContext()
+        attrsD['rel']='enclosure'
+        context.setdefault('links', []).append(FeedParserDict(attrsD))
+        href = attrsD.get('href')
+        if href and not context.get('id'):
+            context['id'] = href
+
+    def _start_source(self, attrsD):
+        self.insource = 1
+        self.hasTitle = 0
+
+    def _end_source(self):
+        self.insource = 0
+        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
+        self.sourcedata.clear()
+
+    def _start_content(self, attrsD):
+        self.pushContent('content', attrsD, 'text/plain', 1)
+        src = attrsD.get('src')
+        if src:
+            self.contentparams['src'] = src
+        self.push('content', 1)
+
+    def _start_prodlink(self, attrsD):
+        self.pushContent('content', attrsD, 'text/html', 1)
+
+    def _start_body(self, attrsD):
+        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
+    _start_xhtml_body = _start_body
+
+    def _start_content_encoded(self, attrsD):
+        self.pushContent('content', attrsD, 'text/html', 1)
+    _start_fullitem = _start_content_encoded
+
+    def _end_content(self):
+        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
+        value = self.popContent('content')
+        if copyToDescription:
+            self._save('description', value)
+
+    _end_body = _end_content
+    _end_xhtml_body = _end_content
+    _end_content_encoded = _end_content
+    _end_fullitem = _end_content
+    _end_prodlink = _end_content
+
+    def _start_itunes_image(self, attrsD):
+        self.push('itunes_image', 0)
+        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+    _start_itunes_link = _start_itunes_image
+
+    def _end_itunes_block(self):
+        value = self.pop('itunes_block', 0)
+        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
+
+    def _end_itunes_explicit(self):
+        value = self.pop('itunes_explicit', 0)
+        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
+
+if _XML_AVAILABLE:
+    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
+        def __init__(self, baseuri, baselang, encoding):
+            if _debug: sys.stderr.write('trying StrictFeedParser\n')
+            xml.sax.handler.ContentHandler.__init__(self)
+            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
+            self.bozo = 0
+            self.exc = None
+
+        def startPrefixMapping(self, prefix, uri):
+            self.trackNamespace(prefix, uri)
+
+        def startElementNS(self, name, qname, attrs):
+            namespace, localname = name
+            lowernamespace = str(namespace or '').lower()
+            if lowernamespace.find('backend.userland.com/rss') <> -1:
+                # match any backend.userland.com namespace
+                namespace = 'http://backend.userland.com/rss'
+                lowernamespace = namespace
+            if qname and qname.find(':') > 0:
+                givenprefix = qname.split(':')[0]
+            else:
+                givenprefix = None
+            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
+            if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
+                raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
+            localname = str(localname).lower()
+
+            # qname implementation is horribly broken in Python 2.1 (it
+            # doesn't report any), and slightly broken in Python 2.2 (it
+            # doesn't report the xml: namespace). So we match up namespaces
+            # with a known list first, and then possibly override them with
+            # the qnames the SAX parser gives us (if indeed it gives us any
+            # at all).  Thanks to MatejC for helping me test this and
+            # tirelessly telling me that it didn't work yet.
+            attrsD = {}
+            if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+                attrsD['xmlns']=namespace
+            if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
+                attrsD['xmlns']=namespace
+
+            if prefix:
+                localname = prefix.lower() + ':' + localname
+            elif namespace and not qname: #Expat
+                for name,value in self.namespacesInUse.items():
+                     if name and value == namespace:
+                         localname = name + ':' + localname
+                         break
+            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
+
+            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
+                lowernamespace = (namespace or '').lower()
+                prefix = self._matchnamespaces.get(lowernamespace, '')
+                if prefix:
+                    attrlocalname = prefix + ':' + attrlocalname
+                attrsD[str(attrlocalname).lower()] = attrvalue
+            for qname in attrs.getQNames():
+                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
+            self.unknown_starttag(localname, attrsD.items())
+
+        def characters(self, text):
+            self.handle_data(text)
+
+        def endElementNS(self, name, qname):
+            namespace, localname = name
+            lowernamespace = str(namespace or '').lower()
+            if qname and qname.find(':') > 0:
+                givenprefix = qname.split(':')[0]
+            else:
+                givenprefix = ''
+            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
+            if prefix:
+                localname = prefix + ':' + localname
+            elif namespace and not qname: #Expat
+                for name,value in self.namespacesInUse.items():
+                     if name and value == namespace:
+                         localname = name + ':' + localname
+                         break
+            localname = str(localname).lower()
+            self.unknown_endtag(localname)
+
+        def error(self, exc):
+            self.bozo = 1
+            self.exc = exc
+
+        def fatalError(self, exc):
+            self.error(exc)
+            raise exc
+
+class _BaseHTMLProcessor(sgmllib.SGMLParser):
+    special = re.compile('''[<>'"]''')
+    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
+      'img', 'input', 'isindex', 'link', 'meta', 'param']
+
+    def __init__(self, encoding, type):
+        self.encoding = encoding
+        self.type = type
+        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
+        sgmllib.SGMLParser.__init__(self)
+
+    def reset(self):
+        self.pieces = []
+        sgmllib.SGMLParser.reset(self)
+
+    def _shorttag_replace(self, match):
+        tag = match.group(1)
+        if tag in self.elements_no_end_tag:
+            return '<' + tag + ' />'
+        else:
+            return '<' + tag + '></' + tag + '>'
+
+    def parse_starttag(self,i):
+        j=sgmllib.SGMLParser.parse_starttag(self, i)
+        if self.type == 'application/xhtml+xml':
+            if j>2 and self.rawdata[j-2:j]=='/>':
+                self.unknown_endtag(self.lasttag)
+        return j
+
+    def feed(self, data):
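+        # escape any '<!' that does not begin a DOCTYPE, comment, or marked section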
+        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
+        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
+        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
+        data = data.replace('&#39;', "'")
+        data = data.replace('&#34;', '"')
+        if self.encoding and type(data) == type(u''):
+            data = data.encode(self.encoding)
+        sgmllib.SGMLParser.feed(self, data)
+        sgmllib.SGMLParser.close(self)
+
+    def normalize_attrs(self, attrs):
+        if not attrs: return attrs
+        # utility method to be called by descendants
+        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
+        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+        attrs.sort()
+        return attrs
+
+    def unknown_starttag(self, tag, attrs):
+        # called for each start tag
+        # attrs is a list of (attr, value) tuples
+        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
+        uattrs = []
+        strattrs=''
+        if attrs:
+            for key, value in attrs:
+                value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
+                value = self.bare_ampersand.sub("&amp;", value)
+                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
+                if type(value) != type(u''):
+                    try:
+                        value = unicode(value, self.encoding)
+                    except:
+                        value = unicode(value, 'iso-8859-1')
+                uattrs.append((unicode(key, self.encoding), value))
+            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
+            if self.encoding:
+                try:
+                    strattrs=strattrs.encode(self.encoding)
+                except:
+                    pass
+        if tag in self.elements_no_end_tag:
+            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+        else:
+            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+
+    def unknown_endtag(self, tag):
+        # called for each end tag, e.g. for </pre>, tag will be 'pre'
+        # Reconstruct the original end tag.
+        if tag not in self.elements_no_end_tag:
+            self.pieces.append("</%(tag)s>" % locals())
+
+    def handle_charref(self, ref):
+        # called for each character reference, e.g. for '&#160;', ref will be '160'
+        # Reconstruct the original character reference.
+        if ref.startswith('x'):
+            value = unichr(int(ref[1:],16))
+        else:
+            value = unichr(int(ref))
+
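+        # references to the 0x80-0x9f range are really Windows-1252 characters;
+        # re-emit them as references to their Unicode equivalents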
+        if value in _cp1252.keys():
+            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
+        else:
+            self.pieces.append('&#%(ref)s;' % locals())
+
+    def handle_entityref(self, ref):
+        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
+        # Reconstruct the original entity reference.
+        if name2codepoint.has_key(ref):
+            self.pieces.append('&%(ref)s;' % locals())
+        else:
+            self.pieces.append('&amp;%(ref)s' % locals())
+
+    def handle_data(self, text):
+        # called for each block of plain text, i.e. outside of any tag and
+        # not containing any character or entity references
+        # Store the original text verbatim.
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        self.pieces.append(text)
+
+    def handle_comment(self, text):
+        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
+        # Reconstruct the original comment.
+        self.pieces.append('<!--%(text)s-->' % locals())
+
+    def handle_pi(self, text):
+        # called for each processing instruction, e.g. <?instruction>
+        # Reconstruct original processing instruction.
+        self.pieces.append('<?%(text)s>' % locals())
+
+    def handle_decl(self, text):
+        # called for the DOCTYPE, if present, e.g.
+        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+        #     "http://www.w3.org/TR/html4/loose.dtd">
+        # Reconstruct original DOCTYPE
+        self.pieces.append('<!%(text)s>' % locals())
+
+    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+    def _scan_name(self, i, declstartpos):
+        rawdata = self.rawdata
+        n = len(rawdata)
+        if i == n:
+            return None, -1
+        m = self._new_declname_match(rawdata, i)
+        if m:
+            s = m.group()
+            name = s.strip()
+            if (i + len(s)) == n:
+                return None, -1  # end of buffer
+            return name.lower(), m.end()
+        else:
+            self.handle_data(rawdata)
+#            self.updatepos(declstartpos, i)
+            return None, -1
+
+    def convert_charref(self, name):
+        return '&#%s;' % name
+
+    def convert_entityref(self, name):
+        return '&%s;' % name
+
+    def output(self):
+        '''Return processed HTML as a single string'''
+        return ''.join([str(p) for p in self.pieces])
+
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
+    def __init__(self, baseuri, baselang, encoding, entities):
+        sgmllib.SGMLParser.__init__(self)
+        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
+        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
+        self.entities=entities
+
+    def decodeEntities(self, element, data):
+        data = data.replace('&#60;', '&lt;')
+        data = data.replace('&#x3c;', '&lt;')
+        data = data.replace('&#x3C;', '&lt;')
+        data = data.replace('&#62;', '&gt;')
+        data = data.replace('&#x3e;', '&gt;')
+        data = data.replace('&#x3E;', '&gt;')
+        data = data.replace('&#38;', '&amp;')
+        data = data.replace('&#x26;', '&amp;')
+        data = data.replace('&#34;', '&quot;')
+        data = data.replace('&#x22;', '&quot;')
+        data = data.replace('&#39;', '&apos;')
+        data = data.replace('&#x27;', '&apos;')
+        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            data = data.replace('&lt;', '<')
+            data = data.replace('&gt;', '>')
+            data = data.replace('&amp;', '&')
+            data = data.replace('&quot;', '"')
+            data = data.replace('&apos;', "'")
+        return data
+
+    def strattrs(self, attrs):
+        return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
+
+class _MicroformatsParser:
+    STRING = 1
+    DATE = 2
+    URI = 3
+    NODE = 4
+    EMAIL = 5
+
+    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
+    known_binary_extensions =  ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']
+
+    def __init__(self, data, baseuri, encoding):
+        self.document = BeautifulSoup.BeautifulSoup(data)
+        self.baseuri = baseuri
+        self.encoding = encoding
+        if type(data) == type(u''):
+            data = data.encode(encoding)
+        self.tags = []
+        self.enclosures = []
+        self.xfn = []
+        self.vcard = None
+
+    def vcardEscape(self, s):
+        if type(s) in (type(''), type(u'')):
+            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
+        return s
+
+    def vcardFold(self, s):
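+        # vCard line folding: up to 75 characters on the first line, then
+        # space-prefixed continuation lines holding up to 74 more each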
+        s = re.sub(';+$', '', s)
+        sFolded = ''
+        iMax = 75
+        sPrefix = ''
+        while len(s) > iMax:
+            sFolded += sPrefix + s[:iMax] + '\n'
+            s = s[iMax:]
+            sPrefix = ' '
+            iMax = 74
+        sFolded += sPrefix + s
+        return sFolded
+
+    def normalize(self, s):
+        return re.sub(r'\s+', ' ', s).strip()
+
+    def unique(self, aList):
+        results = []
+        for element in aList:
+            if element not in results:
+                results.append(element)
+        return results
+
+    def toISO8601(self, dt):
+        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
+
+    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
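+        # find descendants of elmRoot whose class matches sProperty and coerce
+        # the first (or all, if bAllowMultiple) to the requested iPropertyType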
+        all = lambda x: 1
+        sProperty = sProperty.lower()
+        bFound = 0
+        bNormalize = 1
+        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
+        if bAllowMultiple and (iPropertyType != self.NODE):
+            snapResults = []
+            containers = elmRoot(['ul', 'ol'], propertyMatch)
+            for container in containers:
+                snapResults.extend(container('li'))
+            bFound = (len(snapResults) != 0)
+        if not bFound:
+            snapResults = elmRoot(all, propertyMatch)
+            bFound = (len(snapResults) != 0)
+        if (not bFound) and (sProperty == 'value'):
+            snapResults = elmRoot('pre')
+            bFound = (len(snapResults) != 0)
+            bNormalize = not bFound
+            if not bFound:
+                snapResults = [elmRoot]
+                bFound = (len(snapResults) != 0)
+        arFilter = []
+        if sProperty == 'vcard':
+            snapFilter = elmRoot(all, propertyMatch)
+            for node in snapFilter:
+                if node.findParent(all, propertyMatch):
+                    arFilter.append(node)
+        arResults = []
+        for node in snapResults:
+            if node not in arFilter:
+                arResults.append(node)
+        bFound = (len(arResults) != 0)
+        if not bFound:
+            if bAllowMultiple: return []
+            elif iPropertyType == self.STRING: return ''
+            elif iPropertyType == self.DATE: return None
+            elif iPropertyType == self.URI: return ''
+            elif iPropertyType == self.NODE: return None
+            else: return None
+        arValues = []
+        for elmResult in arResults:
+            sValue = None
+            if iPropertyType == self.NODE:
+                if bAllowMultiple:
+                    arValues.append(elmResult)
+                    continue
+                else:
+                    return elmResult
+            sNodeName = elmResult.name.lower()
+            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
+                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if (not sValue) and (sNodeName == 'abbr'):
+                sValue = elmResult.get('title')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if (not sValue) and (iPropertyType == self.URI):
+                if sNodeName == 'a': sValue = elmResult.get('href')
+                elif sNodeName == 'img': sValue = elmResult.get('src')
+                elif sNodeName == 'object': sValue = elmResult.get('data')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if (not sValue) and (sNodeName == 'img'):
+                sValue = elmResult.get('alt')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if not sValue:
+                sValue = elmResult.renderContents()
+                sValue = re.sub(r'<\S[^>]*>', '', sValue)
+                sValue = sValue.replace('\r\n', '\n')
+                sValue = sValue.replace('\r', '\n')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if not sValue: continue
+            if iPropertyType == self.DATE:
+                sValue = _parse_date_iso8601(sValue)
+            if bAllowMultiple:
+                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
+            else:
+                return bAutoEscape and self.vcardEscape(sValue) or sValue
+        return arValues
+
+    def findVCards(self, elmRoot, bAgentParsing=0):
+        sVCards = ''
+
+        if not bAgentParsing:
+            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
+        else:
+            arCards = [elmRoot]
+
+        for elmCard in arCards:
+            arLines = []
+
+            def processSingleString(sProperty):
+                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1)
+                if sValue:
+                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
+                return sValue or ''
+
+            def processSingleURI(sProperty):
+                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
+                if sValue:
+                    sContentType = ''
+                    sEncoding = ''
+                    sValueKey = ''
+                    if sValue.startswith('data:'):
+                        sEncoding = ';ENCODING=b'
+                        sContentType = sValue.split(';')[0].split('/').pop()
+                        sValue = sValue.split(',', 1).pop()
+                    else:
+                        elmValue = self.getPropertyValue(elmCard, sProperty)
+                        if elmValue:
+                            if sProperty != 'url':
+                                sValueKey = ';VALUE=uri'
+                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
+                    sContentType = sContentType.upper()
+                    if sContentType == 'OCTET-STREAM':
+                        sContentType = ''
+                    if sContentType:
+                        sContentType = ';TYPE=' + sContentType.upper()
+                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
+
+            def processTypeValue(sProperty, arDefaultType, arForceType=None):
+                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
+                for elmResult in arResults:
+                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
+                    if arForceType:
+                        arType = self.unique(arForceType + arType)
+                    if not arType:
+                        arType = arDefaultType
+                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
+                    if sValue:
+                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
+
+            # AGENT
+            # must do this before all other properties because it is destructive
+            # (removes nested class="vcard" nodes so they don't interfere with
+            # this vcard's other properties)
+            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
+            for elmAgent in arAgent:
+                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
+                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
+                    sAgentValue = sAgentValue.replace('\n', '\\n')
+                    sAgentValue = sAgentValue.replace(';', '\\;')
+                    if sAgentValue:
+                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
+                    elmAgent['class'] = ''
+                    elmAgent.contents = []
+                else:
+                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1)
+                    if sAgentValue:
+                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
+
+            # FN (full name)
+            sFN = processSingleString('fn')
+
+            # N (name)
+            elmName = self.getPropertyValue(elmCard, 'n')
+            if elmName:
+                sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
+                sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
+                arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
+                arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
+                arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
+                arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
+                                         sGivenName + ';' +
+                                         ','.join(arAdditionalNames) + ';' +
+                                         ','.join(arHonorificPrefixes) + ';' +
+                                         ','.join(arHonorificSuffixes)))
+            elif sFN:
+                # implied "N" optimization
+                # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
+                arNames = self.normalize(sFN).split()
+                if len(arNames) == 2:
+                    bFamilyNameFirst = (arNames[0].endswith(',') or
+                                        len(arNames[1]) == 1 or
+                                        ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
+                    if bFamilyNameFirst:
+                        arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
+                    else:
+                        arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
+
+            # SORT-STRING
+            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
+            if sSortString:
+                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
+
+            # NICKNAME
+            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
+            if arNickname:
+                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
+
+            # PHOTO
+            processSingleURI('photo')
+
+            # BDAY
+            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
+            if dtBday:
+                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
+
+            # ADR (address)
+            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
+            for elmAdr in arAdr:
+                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
+                if not arType:
+                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
+                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
+                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
+                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
+                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
+                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
+                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
+                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
+                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
+                                         sPostOfficeBox + ';' +
+                                         sExtendedAddress + ';' +
+                                         sStreetAddress + ';' +
+                                         sLocality + ';' +
+                                         sRegion + ';' +
+                                         sPostalCode + ';' +
+                                         sCountryName))
+
+            # LABEL
+            processTypeValue('label', ['intl','postal','parcel','work'])
+
+            # TEL (phone number)
+            processTypeValue('tel', ['voice'])
+
+            # EMAIL
+            processTypeValue('email', ['internet'], ['internet'])
+
+            # MAILER
+            processSingleString('mailer')
+
+            # TZ (timezone)
+            processSingleString('tz')
+
+            # GEO (geographical information)
+            elmGeo = self.getPropertyValue(elmCard, 'geo')
+            if elmGeo:
+                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
+                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
+                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
+
+            # TITLE
+            processSingleString('title')
+
+            # ROLE
+            processSingleString('role')
+
+            # LOGO
+            processSingleURI('logo')
+
+            # ORG (organization)
+            elmOrg = self.getPropertyValue(elmCard, 'org')
+            if elmOrg:
+                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
+                if not sOrganizationName:
+                    # implied "organization-name" optimization
+                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
+                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
+                    if sOrganizationName:
+                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
+                else:
+                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
+                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
+
+            # CATEGORY
+            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
+            if arCategory:
+                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
+
+            # NOTE
+            processSingleString('note')
+
+            # REV
+            processSingleString('rev')
+
+            # SOUND
+            processSingleURI('sound')
+
+            # UID
+            processSingleString('uid')
+
+            # URL
+            processSingleURI('url')
+
+            # CLASS
+            processSingleString('class')
+
+            # KEY
+            processSingleURI('key')
+
+            if arLines:
+                arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard']
+                sVCards += '\n'.join(arLines) + '\n'
+
+        return sVCards.strip()
+
+    def isProbablyDownloadable(self, elm):
+        attrsD = elm.attrMap
+        if not attrsD.has_key('href'): return 0
+        linktype = attrsD.get('type', '').strip()
+        if linktype.startswith('audio/') or \
+           linktype.startswith('video/') or \
+           (linktype.startswith('application/') and not linktype.endswith('xml')):
+            return 1
+        path = urlparse.urlparse(attrsD['href'])[2]
+        if path.find('.') == -1: return 0
+        fileext = path.split('.').pop().lower()
+        return fileext in self.known_binary_extensions
+
+    def findTags(self):
+        all = lambda x: 1
+        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
+            href = elm.get('href')
+            if not href: continue
+            urlscheme, domain, path, params, query, fragment = \
+                       urlparse.urlparse(_urljoin(self.baseuri, href))
+            segments = path.split('/')
+            tag = segments.pop()
+            if not tag:
+                tag = segments.pop()
+            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
+            if not tagscheme.endswith('/'):
+                tagscheme += '/'
+            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
+
+    def findEnclosures(self):
+        all = lambda x: 1
+        enclosure_match = re.compile(r'\benclosure\b')
+        for elm in self.document(all, {'href': re.compile(r'.+')}):
+            if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
+            if elm.attrMap not in self.enclosures:
+                self.enclosures.append(elm.attrMap)
+                if elm.string and not elm.get('title'):
+                    self.enclosures[-1]['title'] = elm.string
+
+    def findXFN(self):
+        all = lambda x: 1
+        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
+            rels = elm.get('rel', '').split()
+            xfn_rels = []
+            for rel in rels:
+                if rel in self.known_xfn_relationships:
+                    xfn_rels.append(rel)
+            if xfn_rels:
+                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
+
+def _parseMicroformats(htmlSource, baseURI, encoding):
+    if not BeautifulSoup: return
+    if _debug: sys.stderr.write('entering _parseMicroformats\n')
+    p = _MicroformatsParser(htmlSource, baseURI, encoding)
+    p.vcard = p.findVCards(p.document)
+    p.findTags()
+    p.findEnclosures()
+    p.findXFN()
+    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
+
+class _RelativeURIResolver(_BaseHTMLProcessor):
+    relative_uris = [('a', 'href'),
+                     ('applet', 'codebase'),
+                     ('area', 'href'),
+                     ('blockquote', 'cite'),
+                     ('body', 'background'),
+                     ('del', 'cite'),
+                     ('form', 'action'),
+                     ('frame', 'longdesc'),
+                     ('frame', 'src'),
+                     ('iframe', 'longdesc'),
+                     ('iframe', 'src'),
+                     ('head', 'profile'),
+                     ('img', 'longdesc'),
+                     ('img', 'src'),
+                     ('img', 'usemap'),
+                     ('input', 'src'),
+                     ('input', 'usemap'),
+                     ('ins', 'cite'),
+                     ('link', 'href'),
+                     ('object', 'classid'),
+                     ('object', 'codebase'),
+                     ('object', 'data'),
+                     ('object', 'usemap'),
+                     ('q', 'cite'),
+                     ('script', 'src')]
+
+    def __init__(self, baseuri, encoding, type):
+        _BaseHTMLProcessor.__init__(self, encoding, type)
+        self.baseuri = baseuri
+
+    def resolveURI(self, uri):
+        return _urljoin(self.baseuri, uri.strip())
+
+    def unknown_starttag(self, tag, attrs):
+        attrs = self.normalize_attrs(attrs)
+        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
+        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
+
+def _resolveRelativeURIs(htmlSource, baseURI, encoding, type):
+    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
+    p = _RelativeURIResolver(baseURI, encoding, type)
+    p.feed(htmlSource)
+    return p.output()
+
+class _HTMLSanitizer(_BaseHTMLProcessor):
+    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article',
+      'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
+      'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
+      'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
+      'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
+      'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
+      'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
+      'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
+      'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+      'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
+      'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead',
+      'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']
+
+    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+      'action', 'align', 'alt', 'autoplay', 'autocomplete', 'autofocus', 'axis',
+      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
+      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
+      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
+      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
+      'colspan', 'compact', 'contenteditable', 'coords', 'data', 'datafld',
+      'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir',
+      'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
+      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
+      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
+      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
+      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
+      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
+      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
+      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
+      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
+      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
+      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
+      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
+      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
+      'xml:lang']
+
+    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
+
+    acceptable_css_properties = ['azimuth', 'background-color',
+      'border-bottom-color', 'border-collapse', 'border-color',
+      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+      'white-space', 'width']
+
+    # survey of common keywords found in feeds
+    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
+      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+      'transparent', 'underline', 'white', 'yellow']
+
+    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
+      '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
+
+    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
+      'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
+      'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
+      'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+      'munderover', 'none', 'semantics']
+
+    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
+      'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+      'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
+      'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
+      'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
+      'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
+      'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+      'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
+      'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']
+
+    # svgtiny - foreignObject + linearGradient + radialGradient + stop
+    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
+      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
+      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
+      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
+      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
+      'svg', 'switch', 'text', 'title', 'tspan', 'use']
+
+    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
+    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
+       'arabic-form', 'ascent', 'attributeName', 'attributeType',
+       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
+       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
+       'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
+       'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
+       'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
+       'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
+       'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
+       'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
+       'min', 'name', 'offset', 'opacity', 'orient', 'origin',
+       'overline-position', 'overline-thickness', 'panose-1', 'path',
+       'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
+       'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
+       'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
+       'stop-color', 'stop-opacity', 'strikethrough-position',
+       'strikethrough-thickness', 'stroke', 'stroke-dasharray',
+       'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
+       'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
+       'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
+       'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
+       'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
+       'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
+       'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
+       'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
+       'y2', 'zoomAndPan']
+
+    svg_attr_map = None
+    svg_elem_map = None
+
+    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
+      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+      'stroke-opacity']
+
+    def reset(self):
+        _BaseHTMLProcessor.reset(self)
+        self.unacceptablestack = 0
2395
+        self.mathmlOK = 0
2396
+        self.svgOK = 0
2397
+
2398
+    def unknown_starttag(self, tag, attrs):
2399
+        acceptable_attributes = self.acceptable_attributes
2400
+        keymap = {}
2401
+        if not tag in self.acceptable_elements or self.svgOK:
2402
+            if tag in self.unacceptable_elements_with_end_tag:
2403
+                self.unacceptablestack += 1
2404
+
2405
+            # not otherwise acceptable, perhaps it is MathML or SVG?
2406
+            if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
2407
+                self.mathmlOK += 1
2408
+            if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
2409
+                self.svgOK += 1
2410
+
2411
+            # choose acceptable attributes based on tag class, else bail
2412
+            if  self.mathmlOK and tag in self.mathml_elements:
2413
+                acceptable_attributes = self.mathml_attributes
2414
+            elif self.svgOK and tag in self.svg_elements:
2415
+                # for most vocabularies, lowercasing is a good idea; many
2416
+                # SVG elements, however, are camelCase
2417
+                if not self.svg_attr_map:
2418
+                    lower=[attr.lower() for attr in self.svg_attributes]
2419
+                    mix=[a for a in self.svg_attributes if a not in lower]
2420
+                    self.svg_attributes = lower
2421
+                    self.svg_attr_map = dict([(a.lower(),a) for a in mix])
2422
+
2423
+                    lower=[attr.lower() for attr in self.svg_elements]
2424
+                    mix=[a for a in self.svg_elements if a not in lower]
2425
+                    self.svg_elements = lower
2426
+                    self.svg_elem_map = dict([(a.lower(),a) for a in mix])
2427
+                acceptable_attributes = self.svg_attributes
2428
+                tag = self.svg_elem_map.get(tag,tag)
2429
+                keymap = self.svg_attr_map
2430
+            elif not tag in self.acceptable_elements:
2431
+                return
2432
+
2433
+        # declare xlink namespace, if needed
2434
+        if self.mathmlOK or self.svgOK:
2435
+            if filter(lambda (n,v): n.startswith('xlink:'),attrs):
2436
+                if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
2437
+                    attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
2438
+
2439
+        clean_attrs = []
2440
+        for key, value in self.normalize_attrs(attrs):
2441
+            if key in acceptable_attributes:
2442
+                key=keymap.get(key,key)
2443
+                clean_attrs.append((key,value))
2444
+            elif key=='style':
2445
+                clean_value = self.sanitize_style(value)
2446
+                if clean_value: clean_attrs.append((key,clean_value))
2447
+        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
2448
+
2449
+    def unknown_endtag(self, tag):
2450
+        if not tag in self.acceptable_elements:
2451
+            if tag in self.unacceptable_elements_with_end_tag:
2452
+                self.unacceptablestack -= 1
2453
+            if self.mathmlOK and tag in self.mathml_elements:
2454
+                if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1
2455
+            elif self.svgOK and tag in self.svg_elements:
2456
+                tag = self.svg_elem_map.get(tag,tag)
2457
+                if tag == 'svg' and self.svgOK: self.svgOK -= 1
2458
+            else:
2459
+                return
2460
+        _BaseHTMLProcessor.unknown_endtag(self, tag)
2461
+
2462
+    def handle_pi(self, text):
2463
+        pass
2464
+
2465
+    def handle_decl(self, text):
2466
+        pass
2467
+
2468
+    def handle_data(self, text):
2469
+        if not self.unacceptablestack:
2470
+            _BaseHTMLProcessor.handle_data(self, text)
2471
+
2472
+    def sanitize_style(self, style):
2473
+        # disallow urls
2474
+        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2475
+
2476
+        # gauntlet
2477
+        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
2478
+        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
2479
+
2480
+        clean = []
2481
+        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2482
+          if not value: continue
2483
+          if prop.lower() in self.acceptable_css_properties:
2484
+              clean.append(prop + ': ' + value + ';')
2485
+          elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2486
+              for keyword in value.split():
2487
+                  if not keyword in self.acceptable_css_keywords and \
2488
+                      not self.valid_css_values.match(keyword):
2489
+                      break
2490
+              else:
2491
+                  clean.append(prop + ': ' + value + ';')
2492
+          elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2493
+              clean.append(prop + ': ' + value + ';')
2494
+
2495
+        return ' '.join(clean)
2496
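+
+    # Illustrative behavior (a sketch, not from the original module): url()
+    # values and unrecognized properties are dropped, acceptable ones kept.
+    #
+    #     >>> _HTMLSanitizer('utf-8', 'text/html').sanitize_style(
+    #     ...     'color: red; background: url(evil); position: absolute')
+    #     'color: red;'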
+
2497
+
2498
+def _sanitizeHTML(htmlSource, encoding, _type):
2499
+    # _type is the HTTP content type; named with an underscore so the
+    # builtin type() used later in this function is not shadowed
+    p = _HTMLSanitizer(encoding, _type)
2500
+    p.feed(htmlSource)
2501
+    data = p.output()
2502
+    if TIDY_MARKUP:
2503
+        # loop through list of preferred Tidy interfaces looking for one that's installed,
2504
+        # then set up a common _tidy function to wrap the interface-specific API.
2505
+        _tidy = None
2506
+        for tidy_interface in PREFERRED_TIDY_INTERFACES:
2507
+            try:
2508
+                if tidy_interface == "uTidy":
2509
+                    from tidy import parseString as _utidy
2510
+                    def _tidy(data, **kwargs):
2511
+                        return str(_utidy(data, **kwargs))
2512
+                    break
2513
+                elif tidy_interface == "mxTidy":
2514
+                    from mx.Tidy import Tidy as _mxtidy
2515
+                    def _tidy(data, **kwargs):
2516
+                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
2517
+                        return data
2518
+                    break
2519
+            except:
2520
+                pass
2521
+        if _tidy:
2522
+            utf8 = type(data) == type(u'')
2523
+            if utf8:
2524
+                data = data.encode('utf-8')
2525
+            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
2526
+            if utf8:
2527
+                data = unicode(data, 'utf-8')
2528
+            if data.count('<body'):
2529
+                data = data.split('<body', 1)[1]
2530
+                if data.count('>'):
2531
+                    data = data.split('>', 1)[1]
2532
+            if data.count('</body'):
2533
+                data = data.split('</body', 1)[0]
2534
+    data = data.strip().replace('\r\n', '\n')
2535
+    return data
2536
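+
+# A quick sanity check of _sanitizeHTML (illustrative sketch; output assumes
+# TIDY_MARKUP is left off):
+#
+#     >>> _sanitizeHTML('<p onclick="x()">hi<script>evil()</script></p>',
+#     ...               'utf-8', 'text/html')
+#     '<p>hi</p>'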
+
2537
+class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
2538
+    def http_error_default(self, req, fp, code, msg, headers):
2539
+        if ((code / 100) == 3) and (code != 304):
2540
+            return self.http_error_302(req, fp, code, msg, headers)
2541
+        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
2542
+        infourl.status = code
2543
+        return infourl
2544
+
2545
+    def http_error_302(self, req, fp, code, msg, headers):
2546
+        if headers.dict.has_key('location'):
2547
+            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
2548
+        else:
2549
+            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
2550
+        if not hasattr(infourl, 'status'):
2551
+            infourl.status = code
2552
+        return infourl
2553
+
2554
+    def http_error_301(self, req, fp, code, msg, headers):
2555
+        if headers.dict.has_key('location'):
2556
+            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
2557
+        else:
2558
+            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
2559
+        if not hasattr(infourl, 'status'):
2560
+            infourl.status = code
2561
+        return infourl
2562
+
2563
+    http_error_300 = http_error_302
2564
+    http_error_303 = http_error_302
2565
+    http_error_307 = http_error_302
2566
+
2567
+    def http_error_401(self, req, fp, code, msg, headers):
2568
+        # Check if
2569
+        # - server requires digest auth, AND
2570
+        # - we tried (unsuccessfully) with basic auth, AND
2571
+        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
2572
+        # If all conditions hold, parse authentication information
2573
+        # out of the Authorization header we sent the first time
2574
+        # (for the username and password) and the WWW-Authenticate
2575
+        # header the server sent back (for the realm) and retry
2576
+        # the request with the appropriate digest auth headers instead.
2577
+        # This evil genius hack has been brought to you by Aaron Swartz.
2578
+        host = urlparse.urlparse(req.get_full_url())[1]
2579
+        try:
2580
+            assert sys.version.split()[0] >= '2.3.3'
2581
+            assert base64 != None
2582
+            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
2583
+            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
2584
+            self.add_password(realm, host, user, passw)
2585
+            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
2586
+            self.reset_retry_count()
2587
+            return retry
2588
+        except:
2589
+            return self.http_error_default(req, fp, code, msg, headers)
2590
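+
+# _FeedURLHandler is wired into the opener by _open_resource below; extra
+# urllib2 handlers can be layered alongside it (illustrative, with a
+# hypothetical proxy address):
+#
+#     opener = urllib2.build_opener(_FeedURLHandler(),
+#                                   urllib2.ProxyHandler({'http': 'http://localhost:3128'}))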
+
2591
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
2592
+    """URL, filename, or string --> stream
2593
+
2594
+    This function lets you define parsers that take any input source
2595
+    (URL, pathname to local or network file, or actual data as a string)
2596
+    and deal with it in a uniform manner.  Returned object is guaranteed
2597
+    to have all the basic stdio read methods (read, readline, readlines).
2598
+    Just .close() the object when you're done with it.
2599
+
2600
+    If the etag argument is supplied, it will be used as the value of an
2601
+    If-None-Match request header.
2602
+
2603
+    If the modified argument is supplied, it can be a tuple of 9 integers
2604
+    (as returned by gmtime() in the standard Python time module) or a date
2605
+    string in any format supported by feedparser. Regardless, it MUST
2606
+    be in GMT (Greenwich Mean Time). It will be reformatted into an
2607
+    RFC 1123-compliant date and used as the value of an If-Modified-Since
2608
+    request header.
2609
+
2610
+    If the agent argument is supplied, it will be used as the value of a
2611
+    User-Agent request header.
2612
+
2613
+    If the referrer argument is supplied, it will be used as the value of a
2614
+    Referer[sic] request header.
2615
+
2616
+    If handlers is supplied, it is a list of handlers used to build a
2617
+    urllib2 opener.
2618
+    """
2619
+
2620
+    if hasattr(url_file_stream_or_string, 'read'):
2621
+        return url_file_stream_or_string
2622
+
2623
+    if url_file_stream_or_string == '-':
2624
+        return sys.stdin
2625
+
2626
+    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
2627
+        if not agent:
2628
+            agent = USER_AGENT
2629
+        # test for inline user:password for basic auth
2630
+        auth = None
2631
+        if base64:
2632
+            urltype, rest = urllib.splittype(url_file_stream_or_string)
2633
+            realhost, rest = urllib.splithost(rest)
2634
+            if realhost:
2635
+                user_passwd, realhost = urllib.splituser(realhost)
2636
+                if user_passwd:
2637
+                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
2638
+                    auth = base64.encodestring(user_passwd).strip()
2639
+
2640
+        # iri support
2641
+        try:
2642
+            if isinstance(url_file_stream_or_string,unicode):
2643
+                url_file_stream_or_string = url_file_stream_or_string.encode('idna')
2644
+            else:
2645
+                url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna')
2646
+        except:
2647
+            pass
2648
+
2649
+        # try to open with urllib2 (to use optional headers)
2650
+        request = urllib2.Request(url_file_stream_or_string)
2651
+        request.add_header('User-Agent', agent)
2652
+        if etag:
2653
+            request.add_header('If-None-Match', etag)
2654
+        if type(modified) == type(''):
2655
+            modified = _parse_date(modified)
2656
+        if modified:
2657
+            # format into an RFC 1123-compliant timestamp. We can't use
2658
+            # time.strftime() since the %a and %b directives can be affected
2659
+            # by the current locale, but RFC 2616 states that dates must be
2660
+            # in English.
2661
+            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
2662
+            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
2663
+            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
2664
+        if referrer:
2665
+            request.add_header('Referer', referrer)
2666
+        if gzip and zlib:
2667
+            request.add_header('Accept-encoding', 'gzip, deflate')
2668
+        elif gzip:
2669
+            request.add_header('Accept-encoding', 'gzip')
2670
+        elif zlib:
2671
+            request.add_header('Accept-encoding', 'deflate')
2672
+        else:
2673
+            request.add_header('Accept-encoding', '')
2674
+        if auth:
2675
+            request.add_header('Authorization', 'Basic %s' % auth)
2676
+        if ACCEPT_HEADER:
2677
+            request.add_header('Accept', ACCEPT_HEADER)
2678
+        request.add_header('A-IM', 'feed') # RFC 3229 support
2679
+        opener = urllib2.build_opener(*([_FeedURLHandler()] + handlers))
2680
+        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
2681
+        try:
2682
+            return opener.open(request)
2683
+        finally:
2684
+            opener.close() # JohnD
2685
+
2686
+    # try to open with native open function (if url_file_stream_or_string is a filename)
2687
+    try:
2688
+        return open(url_file_stream_or_string)
2689
+    except:
2690
+        pass
2691
+
2692
+    # treat url_file_stream_or_string as string
2693
+    return _StringIO(str(url_file_stream_or_string))
2694
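+
+# Usage sketch (hypothetical inputs): each call below yields a stream with
+# the basic stdio read methods.
+#
+#     f = _open_resource('http://example.com/feed.xml', None, None, USER_AGENT, None, [])
+#     f = _open_resource('/tmp/feed.xml', None, None, None, None, [])
+#     f = _open_resource('<rss version="2.0"></rss>', None, None, None, None, [])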
+
2695
+_date_handlers = []
2696
+def registerDateHandler(func):
2697
+    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
2698
+    _date_handlers.insert(0, func)
2699
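+
+# For example, a handler for seconds-since-epoch strings could be registered
+# like this (an illustrative sketch, not part of the module):
+#
+#     def _parse_date_epoch(dateString):
+#         '''Parse a string of digits as seconds since the epoch, in GMT'''
+#         if not dateString.isdigit(): return None
+#         return time.gmtime(int(dateString))
+#     registerDateHandler(_parse_date_epoch)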
+
2700
+# ISO-8601 date parsing routines written by Fazal Majid.
2701
+# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2702
+# parser is beyond the scope of feedparser and would be a worthwhile addition
2703
+# to the Python library.
2704
+# A single regular expression cannot parse ISO 8601 date formats into groups
2705
+# as the standard is highly irregular (for instance, is 030104 2003-01-04 or
2706
+# 0301-04-01?), so we use templates instead.
2707
+# Please note the order in templates is significant because we need a
2708
+# greedy match.
2709
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
2710
+                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2711
+                '-YY-?MM', '-OOO', '-YY',
2712
+                '--MM-?DD', '--MM',
2713
+                '---DD',
2714
+                'CC', '']
2715
+_iso8601_re = [
2716
+    tmpl.replace(
2717
+    'YYYY', r'(?P<year>\d{4})').replace(
2718
+    'YY', r'(?P<year>\d\d)').replace(
2719
+    'MM', r'(?P<month>[01]\d)').replace(
2720
+    'DD', r'(?P<day>[0123]\d)').replace(
2721
+    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2722
+    'CC', r'(?P<century>\d\d$)')
2723
+    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2724
+    + r'(:(?P<second>\d{2}(\.\d*)?))?'
2725
+    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2726
+    for tmpl in _iso8601_tmpl]
2727
+del tmpl
2728
+_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2729
+del regex
2730
+def _parse_date_iso8601(dateString):
2731
+    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
2732
+    m = None
2733
+    for _iso8601_match in _iso8601_matches:
2734
+        m = _iso8601_match(dateString)
2735
+        if m: break
2736
+    if not m: return
2737
+    if m.span() == (0, 0): return
2738
+    params = m.groupdict()
2739
+    ordinal = params.get('ordinal', 0)
2740
+    if ordinal:
2741
+        ordinal = int(ordinal)
2742
+    else:
2743
+        ordinal = 0
2744
+    year = params.get('year', '--')
2745
+    if not year or year == '--':
2746
+        year = time.gmtime()[0]
2747
+    elif len(year) == 2:
2748
+        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
2749
+        year = 100 * int(time.gmtime()[0] / 100) + int(year)
2750
+    else:
2751
+        year = int(year)
2752
+    month = params.get('month', '-')
2753
+    if not month or month == '-':
2754
+        # ordinals are NOT normalized by mktime, we simulate them
2755
+        # by setting month=1, day=ordinal
2756
+        if ordinal:
2757
+            month = 1
2758
+        else:
2759
+            month = time.gmtime()[1]
2760
+    month = int(month)
2761
+    day = params.get('day', 0)
2762
+    if not day:
2763
+        # see above
2764
+        if ordinal:
2765
+            day = ordinal
2766
+        elif params.get('century', 0) or \
2767
+                 params.get('year', 0) or params.get('month', 0):
2768
+            day = 1
2769
+        else:
2770
+            day = time.gmtime()[2]
2771
+    else:
2772
+        day = int(day)
2773
+    # special case of the century - is the first year of the 21st century
2774
+    # 2000 or 2001? The debate goes on...
2775
+    if 'century' in params.keys():
2776
+        year = (int(params['century']) - 1) * 100 + 1
2777
+    # in ISO 8601 most fields are optional
2778
+    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
2779
+        if not params.get(field, None):
2780
+            params[field] = 0
2781
+    hour = int(params.get('hour', 0))
2782
+    minute = int(params.get('minute', 0))
2783
+    second = int(float(params.get('second', 0)))
2784
+    # weekday is normalized by mktime(), we can ignore it
2785
+    weekday = 0
2786
+    daylight_savings_flag = -1
2787
+    tm = [year, month, day, hour, minute, second, weekday,
2788
+          ordinal, daylight_savings_flag]
2789
+    # ISO 8601 time zone adjustments
2790
+    tz = params.get('tz')
2791
+    if tz and tz != 'Z':
2792
+        if tz[0] == '-':
2793
+            tm[3] += int(params.get('tzhour', 0))
2794
+            tm[4] += int(params.get('tzmin', 0))
2795
+        elif tz[0] == '+':
2796
+            tm[3] -= int(params.get('tzhour', 0))
2797
+            tm[4] -= int(params.get('tzmin', 0))
2798
+        else:
2799
+            return None
2800
+    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
2801
+    # which is guaranteed to normalize d/m/y/h/m/s.
2802
+    # Many implementations have bugs, but we'll pretend they don't.
2803
+    return time.localtime(time.mktime(tm))
2804
+registerDateHandler(_parse_date_iso8601)
2805
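+
+# A few of the shapes the templates above accept (illustrative; the result is
+# normalized through local time, so only the date fields are shown):
+#
+#     >>> _parse_date_iso8601('2004-01-05')[:3]
+#     (2004, 1, 5)
+#     >>> _parse_date_iso8601('20040105')[:3]
+#     (2004, 1, 5)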
+
2806
+# 8-bit date handling routines written by ytrewq1.
2807
+_korean_year  = u'\ub144' # b3e2 in euc-kr
2808
+_korean_month = u'\uc6d4' # bff9 in euc-kr
2809
+_korean_day   = u'\uc77c' # c0cf in euc-kr
2810
+_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
2811
+_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
2812
+
2813
+_korean_onblog_date_re = \
2814
+    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
2815
+               (_korean_year, _korean_month, _korean_day))
2816
+_korean_nate_date_re = \
2817
+    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
2818
+               (_korean_am, _korean_pm))
2819
+def _parse_date_onblog(dateString):
2820
+    '''Parse a string according to the OnBlog 8-bit date format'''
2821
+    m = _korean_onblog_date_re.match(dateString)
2822
+    if not m: return
2823
+    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
2824
+                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2825
+                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2826
+                 'zonediff': '+09:00'}
2827
+    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
2828
+    return _parse_date_w3dtf(w3dtfdate)
2829
+registerDateHandler(_parse_date_onblog)
2830
+
2831
+def _parse_date_nate(dateString):
2832
+    '''Parse a string according to the Nate 8-bit date format'''
2833
+    m = _korean_nate_date_re.match(dateString)
2834
+    if not m: return
2835
+    hour = int(m.group(5))
2836
+    ampm = m.group(4)
2837
+    if (ampm == _korean_pm):
2838
+        hour += 12
2839
+    hour = str(hour)
2840
+    if len(hour) == 1:
2841
+        hour = '0' + hour
2842
+    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
2843
+                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2844
+                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
2845
+                 'zonediff': '+09:00'}
2846
+    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
2847
+    return _parse_date_w3dtf(w3dtfdate)
2848
+registerDateHandler(_parse_date_nate)
2849
+
2850
+_mssql_date_re = \
2851
+    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
2852
+def _parse_date_mssql(dateString):
2853
+    '''Parse a string according to the MS SQL date format'''
2854
+    m = _mssql_date_re.match(dateString)
2855
+    if not m: return
2856
+    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
2857
+                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2858
+                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2859
+                 'zonediff': '+09:00'}
2860
+    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
2861
+    return _parse_date_w3dtf(w3dtfdate)
2862
+registerDateHandler(_parse_date_mssql)
2863
+
2864
+# Unicode strings for Greek date strings
2865
+_greek_months = \
2866
+  { \
2867
+   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
2868
+   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
2869
+   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
2870
+   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
2871
+   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
2872
+   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
2873
+   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
2874
+   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
2875
+   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2876
+   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
2877
+   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2878
+   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
2879
+   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
2880
+   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
2881
+   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
2882
+   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
2883
+   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
2884
+   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
2885
+   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
2886
+  }
2887
+
2888
+_greek_wdays = \
2889
+  { \
2890
+   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2891
+   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2892
+   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2893
+   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2894
+   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2895
+   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2896
+   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2897
+  }
2898
+
2899
+_greek_date_format_re = \
2900
+    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2901
+
2902
+def _parse_date_greek(dateString):
2903
+    '''Parse a string according to a Greek 8-bit date format.'''
2904
+    m = _greek_date_format_re.match(dateString)
2905
+    if not m: return
2906
+    try:
2907
+        wday = _greek_wdays[m.group(1)]
2908
+        month = _greek_months[m.group(3)]
2909
+    except:
2910
+        return
2911
+    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
2912
+                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
2913
+                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
2914
+                  'zonediff': m.group(8)}
2915
+    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
2916
+    return _parse_date_rfc822(rfc822date)
2917
+registerDateHandler(_parse_date_greek)
2918
+
2919
+# Unicode strings for Hungarian date strings
2920
+_hungarian_months = \
2921
+  { \
2922
+    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
2923
+    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
2924
+    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
2925
+    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
2926
+    u'm\u00e1jus':    u'05',  # e1 in iso-8859-2
2927
+    u'j\u00fanius':   u'06',  # fa in iso-8859-2
2928
+    u'j\u00falius':   u'07',  # fa in iso-8859-2
2929
+    u'augusztus':     u'08',
2930
+    u'szeptember':    u'09',
2931
+    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
2932
+    u'november':      u'11',
2933
+    u'december':      u'12',
2934
+  }
2935
+
2936
+_hungarian_date_format_re = \
2937
+  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
2938
+
2939
+def _parse_date_hungarian(dateString):
2940
+    '''Parse a string according to a Hungarian 8-bit date format.'''
2941
+    m = _hungarian_date_format_re.match(dateString)
2942
+    if not m: return
2943
+    try:
2944
+        month = _hungarian_months[m.group(2)]
2945
+        day = m.group(3)
2946
+        if len(day) == 1:
2947
+            day = '0' + day
2948
+        hour = m.group(4)
2949
+        if len(hour) == 1:
2950
+            hour = '0' + hour
2951
+    except:
2952
+        return
2953
+    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
2954
+                {'year': m.group(1), 'month': month, 'day': day,\
2955
+                 'hour': hour, 'minute': m.group(5),\
2956
+                 'zonediff': m.group(6)}
2957
+    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
2958
+    return _parse_date_w3dtf(w3dtfdate)
2959
+registerDateHandler(_parse_date_hungarian)
2960
+
2961
+# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2962
+# Drake and licensed under the Python license.  Removed all range checking
2963
+# for month, day, hour, minute, and second, since mktime will normalize
2964
+# these later
2965
+def _parse_date_w3dtf(dateString):
2966
+    def __extract_date(m):
2967
+        year = int(m.group('year'))
2968
+        if year < 100:
2969
+            year = 100 * int(time.gmtime()[0] / 100) + int(year)
2970
+        if year < 1000:
2971
+            return 0, 0, 0
2972
+        julian = m.group('julian')
2973
+        if julian:
2974
+            julian = int(julian)
2975
+            month = julian / 30 + 1
2976
+            day = julian % 30 + 1
2977
+            jday = None
2978
+            while jday != julian:
2979
+                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2980
+                jday = time.gmtime(t)[-2]
2981
+                diff = abs(jday - julian)
2982
+                if jday > julian:
2983
+                    if diff < day:
2984
+                        day = day - diff
2985
+                    else:
2986
+                        month = month - 1
2987
+                        day = 31
2988
+                elif jday < julian:
2989
+                    if day + diff < 28:
2990
+                        day = day + diff
2991
+                    else:
2992
+                        month = month + 1
2993
+            return year, month, day
2994
+        month = m.group('month')
2995
+        day = 1
2996
+        if month is None:
2997
+            month = 1
2998
+        else:
2999
+            month = int(month)
3000
+            day = m.group('day')
3001
+            if day:
3002
+                day = int(day)
3003
+            else:
3004
+                day = 1
3005
+        return year, month, day
3006
+
3007
+    def __extract_time(m):
3008
+        if not m:
3009
+            return 0, 0, 0
3010
+        hours = m.group('hours')
3011
+        if not hours:
3012
+            return 0, 0, 0
3013
+        hours = int(hours)
3014
+        minutes = int(m.group('minutes'))
3015
+        seconds = m.group('seconds')
3016
+        if seconds:
3017
+            seconds = int(seconds)
3018
+        else:
3019
+            seconds = 0
3020
+        return hours, minutes, seconds
3021
+
3022
+    def __extract_tzd(m):
3023
+        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
3024
+        if not m:
3025
+            return 0
3026
+        tzd = m.group('tzd')
3027
+        if not tzd:
3028
+            return 0
3029
+        if tzd == 'Z':
3030
+            return 0
3031
+        hours = int(m.group('tzdhours'))
3032
+        minutes = m.group('tzdminutes')
3033
+        if minutes:
3034
+            minutes = int(minutes)
3035
+        else:
3036
+            minutes = 0
3037
+        offset = (hours*60 + minutes) * 60
3038
+        if tzd[0] == '+':
3039
+            return -offset
3040
+        return offset
3041
+
3042
+    __date_re = ('(?P<year>\d\d\d\d)'
3043
+                 '(?:(?P<dsep>-|)'
3044
+                 '(?:(?P<julian>\d\d\d)'
3045
+                 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
3046
+    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
3047
+    __tzd_rx = re.compile(__tzd_re)
3048
+    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
3049
+                 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
3050
+                 + __tzd_re)
3051
+    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
3052
+    __datetime_rx = re.compile(__datetime_re)
3053
+    m = __datetime_rx.match(dateString)
3054
+    if (m is None) or (m.group() != dateString): return
3055
+    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
3056
+    if gmt[0] == 0: return
3057
+    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
3058
+registerDateHandler(_parse_date_w3dtf)
3059
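+
+# Illustrative result:
+#
+#     >>> _parse_date_w3dtf('2003-12-31T10:14:55Z')[:6]
+#     (2003, 12, 31, 10, 14, 55)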
+
3060
+def _parse_date_rfc822(dateString):
3061
+    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
3062
+    data = dateString.split()
3063
+    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
3064
+        del data[0]
3065
+    if len(data) == 4:
3066
+        s = data[3]
3067
+        i = s.find('+')
3068
+        if i > 0:
3069
+            data[3:] = [s[:i], s[i+1:]]
3070
+        else:
3071
+            data.append('')
3072
+        dateString = " ".join(data)
3073
+    if len(data) < 5:
3074
+        dateString += ' 00:00:00 GMT'
3075
+    tm = rfc822.parsedate_tz(dateString)
3076
+    if tm:
3077
+        return time.gmtime(rfc822.mktime_tz(tm))
3078
+# rfc822.py defines several time zones, but we define some extra ones.
3079
+# 'ET' is equivalent to 'EST', etc.
3080
+_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
3081
+rfc822._timezones.update(_additional_timezones)
3082
+registerDateHandler(_parse_date_rfc822)
3083
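+
+# Illustrative result:
+#
+#     >>> _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')[:6]
+#     (2004, 1, 1, 19, 48, 21)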
+
3084
+def _parse_date_perforce(aDateString):
3085
+        """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3086
+        # Fri, 2006/09/15 08:19:53 EDT
3087
+        _my_date_pattern = re.compile( \
3088
+                r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3089
+
3090
+        m = _my_date_pattern.search(aDateString)
3091
+        if not m: return
+        dow, year, month, day, hour, minute, second, tz = m.groups()
3092
+        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3093
+        dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3094
+        tm = rfc822.parsedate_tz(dateString)
3095
+        if tm:
3096
+                return time.gmtime(rfc822.mktime_tz(tm))
3097
+registerDateHandler(_parse_date_perforce)
3098
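+
+# Illustrative result (EDT is four hours behind GMT):
+#
+#     >>> _parse_date_perforce('Fri, 2006/09/15 08:19:53 EDT')[:6]
+#     (2006, 9, 15, 12, 19, 53)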
+
3099
+def _parse_date(dateString):
3100
+    '''Parses a variety of date formats into a 9-tuple in GMT'''
3101
+    for handler in _date_handlers:
3102
+        try:
3103
+            date9tuple = handler(dateString)
3104
+            if not date9tuple: continue
3105
+            if len(date9tuple) != 9:
3106
+                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
3107
+                raise ValueError
3108
+            map(int, date9tuple)
3109
+            return date9tuple
3110
+        except Exception as e:
3111
+            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
3112
+            pass
3113
+    return None
3114
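+
+# Strings no handler recognizes simply fall through (illustrative):
+#
+#     >>> _parse_date('not a date') is None
+#     True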
+
3115
+def _getCharacterEncoding(http_headers, xml_data):
3116
+    '''Get the character encoding of the XML document
3117
+
3118
+    http_headers is a dictionary
3119
+    xml_data is a raw string (not Unicode)
3120
+
3121
+    This is so much trickier than it sounds, it's not even funny.
3122
+    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
3123
+    is application/xml, application/*+xml,
3124
+    application/xml-external-parsed-entity, or application/xml-dtd,
3125
+    the encoding given in the charset parameter of the HTTP Content-Type
3126
+    takes precedence over the encoding given in the XML prefix within the
3127
+    document, and defaults to 'utf-8' if neither are specified.  But, if
3128
+    the HTTP Content-Type is text/xml, text/*+xml, or
3129
+    text/xml-external-parsed-entity, the encoding given in the XML prefix
3130
+    within the document is ALWAYS IGNORED and only the encoding given in
3131
+    the charset parameter of the HTTP Content-Type header should be
3132
+    respected, and it defaults to 'us-ascii' if not specified.
3133
+
3134
+    Furthermore, discussion on the atom-syntax mailing list with the
3135
+    author of RFC 3023 leads me to the conclusion that any document
3136
+    served with a Content-Type of text/* and no charset parameter
3137
+    must be treated as us-ascii.  (We now do this.)  And also that it
3138
+    must always be flagged as non-well-formed.  (We now do this too.)
3139
+
3140
+    If Content-Type is unspecified (input was local file or non-HTTP source)
3141
+    or unrecognized (server just got it totally wrong), then go by the
3142
+    encoding given in the XML prefix of the document and default to
3143
+    'iso-8859-1' as per the HTTP specification (RFC 2616).
3144
+
3145
+    Then, assuming we didn't find a character encoding in the HTTP headers
3146
+    (and the HTTP Content-type allowed us to look in the body), we need
3147
+    to sniff the first few bytes of the XML data and try to determine
3148
+    whether the encoding is ASCII-compatible.  Section F of the XML
3149
+    specification shows the way here:
3150
+    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3151
+
3152
+    If the sniffed encoding is not ASCII-compatible, we need to make it
3153
+    ASCII compatible so that we can sniff further into the XML declaration
3154
+    to find the encoding attribute, which will tell us the true encoding.
3155
+
3156
+    Of course, none of this guarantees that we will be able to parse the
3157
+    feed in the declared character encoding (assuming it was declared
3158
+    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
3159
+    you should definitely install them if you can.
3160
+    http://cjkpython.i18n.org/
3161
+    '''
3162
+
3163
+    def _parseHTTPContentType(content_type):
3164
+        '''takes HTTP Content-Type header and returns (content type, charset)
3165
+
3166
+        If no charset is specified, returns (content type, '')
3167
+        If no content type is specified, returns ('', '')
3168
+        Both return parameters are guaranteed to be lowercase strings
3169
+        '''
3170
+        content_type = content_type or ''
3171
+        content_type, params = cgi.parse_header(content_type)
3172
+        return content_type, params.get('charset', '').replace("'", '')
3173
+
3174
+    sniffed_xml_encoding = ''
3175
+    xml_encoding = ''
3176
+    true_encoding = ''
3177
+    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
3178
+    # Must sniff for non-ASCII-compatible character encodings before
3179
+    # searching for XML declaration.  This heuristic is defined in
3180
+    # section F of the XML specification:
3181
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3182
+    try:
3183
+        if xml_data[:4] == '\x4c\x6f\xa7\x94':
3184
+            # EBCDIC
3185
+            xml_data = _ebcdic_to_ascii(xml_data)
3186
+        elif xml_data[:4] == '\x00\x3c\x00\x3f':
3187
+            # UTF-16BE
3188
+            sniffed_xml_encoding = 'utf-16be'
3189
+            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
3190
+        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
3191
+            # UTF-16BE with BOM
3192
+            sniffed_xml_encoding = 'utf-16be'
3193
+            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
3194
+        elif xml_data[:4] == '\x3c\x00\x3f\x00':
3195
+            # UTF-16LE
3196
+            sniffed_xml_encoding = 'utf-16le'
3197
+            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
3198
+        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
3199
+            # UTF-16LE with BOM
3200
+            sniffed_xml_encoding = 'utf-16le'
3201
+            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
3202
+        elif xml_data[:4] == '\x00\x00\x00\x3c':
3203
+            # UTF-32BE
3204
+            sniffed_xml_encoding = 'utf-32be'
3205
+            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
3206
+        elif xml_data[:4] == '\x3c\x00\x00\x00':
3207
+            # UTF-32LE
3208
+            sniffed_xml_encoding = 'utf-32le'
3209
+            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
3210
+        elif xml_data[:4] == '\x00\x00\xfe\xff':
3211
+            # UTF-32BE with BOM
3212
+            sniffed_xml_encoding = 'utf-32be'
3213
+            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
3214
+        elif xml_data[:4] == '\xff\xfe\x00\x00':
3215
+            # UTF-32LE with BOM
3216
+            sniffed_xml_encoding = 'utf-32le'
3217
+            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
3218
+        elif xml_data[:3] == '\xef\xbb\xbf':
3219
+            # UTF-8 with BOM
3220
+            sniffed_xml_encoding = 'utf-8'
3221
+            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
3222
+        else:
3223
+            # ASCII-compatible
3224
+            pass
3225
+        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
3226
+    except:
3227
+        xml_encoding_match = None
3228
+    if xml_encoding_match:
3229
+        xml_encoding = xml_encoding_match.groups()[0].lower()
3230
+        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
3231
+            xml_encoding = sniffed_xml_encoding
3232
+    acceptable_content_type = 0
3233
+    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
3234
+    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
3235
+    if (http_content_type in application_content_types) or \
3236
+       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
3237
+        acceptable_content_type = 1
3238
+        true_encoding = http_encoding or xml_encoding or 'utf-8'
3239
+    elif (http_content_type in text_content_types) or \
3240
+         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
3241
+        acceptable_content_type = 1
3242
+        true_encoding = http_encoding or 'us-ascii'
3243
+    elif http_content_type.startswith('text/'):
3244
+        true_encoding = http_encoding or 'us-ascii'
3245
+    elif http_headers and (not http_headers.has_key('content-type')):
3246
+        true_encoding = xml_encoding or 'iso-8859-1'
3247
+    else:
3248
+        true_encoding = xml_encoding or 'utf-8'
3249
+    # some feeds claim to be gb2312 but are actually gb18030.
3250
+    # apparently MSIE and Firefox both do the following switch:
3251
+    if true_encoding.lower() == 'gb2312':
3252
+        true_encoding = 'gb18030'
3253
+    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
3254
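+
+# Illustrative call (xml_data is a raw byte string): for application/xml the
+# HTTP charset parameter wins over the XML declaration, per RFC 3023.
+#
+#     >>> _getCharacterEncoding(
+#     ...     {'content-type': 'application/xml; charset=iso-8859-1'},
+#     ...     '<?xml version="1.0" encoding="utf-8"?><feed/>')[0]
+#     'iso-8859-1'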
+
3255
+def _toUTF8(data, encoding):
3256
+    '''Changes an XML data stream on the fly to specify a new encoding
3257
+
3258
+    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3259
+    encoding is a string recognized by encodings.aliases
3260
+    '''
3261
+    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
3262
+    # strip Byte Order Mark (if present)
3263
+    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
3264
+        if _debug:
3265
+            sys.stderr.write('stripping BOM\n')
3266
+            if encoding != 'utf-16be':
3267
+                sys.stderr.write('trying utf-16be instead\n')
3268
+        encoding = 'utf-16be'
3269
+        data = data[2:]
3270
+    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
3271
+        if _debug:
3272
+            sys.stderr.write('stripping BOM\n')
3273
+            if encoding != 'utf-16le':
3274
+                sys.stderr.write('trying utf-16le instead\n')
3275
+        encoding = 'utf-16le'
3276
+        data = data[2:]
3277
+    elif data[:3] == '\xef\xbb\xbf':
3278
+        if _debug:
3279
+            sys.stderr.write('stripping BOM\n')
3280
+            if encoding != 'utf-8':
3281
+                sys.stderr.write('trying utf-8 instead\n')
3282
+        encoding = 'utf-8'
3283
+        data = data[3:]
3284
+    elif data[:4] == '\x00\x00\xfe\xff':
3285
+        if _debug:
3286
+            sys.stderr.write('stripping BOM\n')
3287
+            if encoding != 'utf-32be':
3288
+                sys.stderr.write('trying utf-32be instead\n')
3289
+        encoding = 'utf-32be'
3290
+        data = data[4:]
3291
+    elif data[:4] == '\xff\xfe\x00\x00':
3292
+        if _debug:
3293
+            sys.stderr.write('stripping BOM\n')
3294
+            if encoding != 'utf-32le':
3295
+                sys.stderr.write('trying utf-32le instead\n')
3296
+        encoding = 'utf-32le'
3297
+        data = data[4:]
3298
+    newdata = unicode(data, encoding)
3299
+    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
3300
+    declmatch = re.compile('^<\?xml[^>]*?>')
3301
+    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
3302
+    if declmatch.search(newdata):
3303
+        newdata = declmatch.sub(newdecl, newdata)
3304
+    else:
3305
+        newdata = newdecl + u'\n' + newdata
3306
+    return newdata.encode('utf-8')
3307
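+
+# Illustrative round trip: the UTF-8 BOM is stripped and a fresh declaration
+# is prepended.
+#
+#     >>> _toUTF8('\xef\xbb\xbf<feed/>', 'utf-8')
+#     "<?xml version='1.0' encoding='utf-8'?>\n<feed/>"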
+
3308
+def _stripDoctype(data):
3309
+    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3310
+
3311
+    rss_version may be 'rss091n' or None
3312
+    stripped_data is the same XML document, minus the DOCTYPE
3313
+    '''
3314
+    start = re.search('<\w',data)
3315
+    start = start and start.start() or -1
3316
+    head,data = data[:start+1], data[start+1:]
3317
+
3318
+    entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
3319
+    entity_results=entity_pattern.findall(head)
3320
+    head = entity_pattern.sub('', head)
3321
+    doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
3322
+    doctype_results = doctype_pattern.findall(head)
3323
+    doctype = doctype_results and doctype_results[0] or ''
3324
+    if doctype.lower().count('netscape'):
3325
+        version = 'rss091n'
3326
+    else:
3327
+        version = None
3328
+
3329
+    # only allow in 'safe' inline entity definitions
3330
+    replacement=''
3331
+    if len(doctype_results)==1 and entity_results:
3332
+       safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
3333
+       safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
3334
+       if safe_entities:
3335
+           replacement='<!DOCTYPE feed [\n  <!ENTITY %s>\n]>' % '>\n  <!ENTITY '.join(safe_entities)
3336
+    data = doctype_pattern.sub(replacement, head) + data
3337
+
3338
+    return version, data, dict(replacement and safe_pattern.findall(replacement))
3339
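+
+# Illustrative call: a Netscape RSS 0.91 DOCTYPE is detected and stripped.
+#
+#     >>> doc = ('<!DOCTYPE rss SYSTEM '
+#     ...        '"http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss></rss>')
+#     >>> _stripDoctype(doc)[:2]
+#     ('rss091n', '<rss></rss>')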
+
3340
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
3341
+    '''Parse a feed from a URL, file, stream, or string'''
3342
+    result = FeedParserDict()
3343
+    result['feed'] = FeedParserDict()
3344
+    result['entries'] = []
3345
+    if _XML_AVAILABLE:
3346
+        result['bozo'] = 0
3347
+    if type(handlers) == types.InstanceType:
3348
+        handlers = [handlers]
3349
+    try:
3350
+        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
3351
+        data = f.read()
3352
+    except Exception as e:
3353
+        result['bozo'] = 1
3354
+        result['bozo_exception'] = e
3355
+        data = ''
3356
+        f = None
3357
+
3358
+    # if feed is gzip-compressed, decompress it
3359
+    if f and data and hasattr(f, 'headers'):
3360
+        if gzip and f.headers.get('content-encoding', '') == 'gzip':
3361
+            try:
3362
+                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3363
+            except Exception as e:
3364
+                # Some feeds claim to be gzipped but they're not, so
3365
+                # we get garbage.  Ideally, we should re-request the
3366
+                # feed without the 'Accept-encoding: gzip' header,
3367
+                # but we don't.
3368
+                result['bozo'] = 1
3369
+                result['bozo_exception'] = e
3370
+                data = ''
3371
+        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
3372
+            try:
3373
+                data = zlib.decompress(data, -zlib.MAX_WBITS)
3374
+            except Exception as e:
3375
+                result['bozo'] = 1
3376
+                result['bozo_exception'] = e
3377
+                data = ''
3378
+
3379
+    # save HTTP headers
3380
+    if hasattr(f, 'info'):
3381
+        info = f.info()
3382
+        etag = info.getheader('ETag')
3383
+        if etag:
3384
+            result['etag'] = etag
3385
+        last_modified = info.getheader('Last-Modified')
3386
+        if last_modified:
3387
+            result['modified'] = _parse_date(last_modified)
3388
+    if hasattr(f, 'url'):
3389
+        result['href'] = f.url
3390
+        result['status'] = 200
3391
+    if hasattr(f, 'status'):
3392
+        result['status'] = f.status
3393
+    if hasattr(f, 'headers'):
3394
+        result['headers'] = f.headers.dict
3395
+    if hasattr(f, 'close'):
3396
+        f.close()
3397
+
3398
+    # there are four encodings to keep track of:
3399
+    # - http_encoding is the encoding declared in the Content-Type HTTP header
3400
+    # - xml_encoding is the encoding declared in the <?xml declaration
3401
+    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
3402
+    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3403
+    http_headers = result.get('headers', {})
3404
+    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
3405
+        _getCharacterEncoding(http_headers, data)
3406
+    if http_headers and (not acceptable_content_type):
3407
+        if http_headers.has_key('content-type'):
3408
+            bozo_message = '%s is not an XML media type' % http_headers['content-type']
3409
+        else:
3410
+            bozo_message = 'no Content-type specified'
+        result['bozo'] = 1
+        result['bozo_exception'] = NonXMLContentType(bozo_message)
+
+    result['version'], data, entities = _stripDoctype(data)
+
+    baseuri = http_headers.get('content-location', result.get('href'))
+    baselang = http_headers.get('content-language', None)
+
+    # if server sent 304, we're done
+    if result.get('status', 0) == 304:
+        result['version'] = ''
+        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
+            'so the server sent no data.  This is a feature, not a bug!'
+        return result
+
+    # if there was a problem downloading, we're done
+    if not data:
+        return result
+
+    # determine character encoding
+    use_strict_parser = 0
+    known_encoding = 0
+    tried_encodings = []
+    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
+        if not proposed_encoding: continue
+        if proposed_encoding in tried_encodings: continue
+        tried_encodings.append(proposed_encoding)
+        try:
+            data = _toUTF8(data, proposed_encoding)
+            known_encoding = use_strict_parser = 1
+            break
+        except:
+            pass
+    # if no luck and we have auto-detection library, try that
+    if (not known_encoding) and chardet:
+        try:
+            proposed_encoding = chardet.detect(data)['encoding']
+            if proposed_encoding and (proposed_encoding not in tried_encodings):
+                tried_encodings.append(proposed_encoding)
+                data = _toUTF8(data, proposed_encoding)
+                known_encoding = use_strict_parser = 1
+        except:
+            pass
+    # if still no luck and we haven't tried utf-8 yet, try that
+    if (not known_encoding) and ('utf-8' not in tried_encodings):
+        try:
+            proposed_encoding = 'utf-8'
+            tried_encodings.append(proposed_encoding)
+            data = _toUTF8(data, proposed_encoding)
+            known_encoding = use_strict_parser = 1
+        except:
+            pass
+    # if still no luck and we haven't tried windows-1252 yet, try that
+    if (not known_encoding) and ('windows-1252' not in tried_encodings):
+        try:
+            proposed_encoding = 'windows-1252'
+            tried_encodings.append(proposed_encoding)
+            data = _toUTF8(data, proposed_encoding)
+            known_encoding = use_strict_parser = 1
+        except:
+            pass
+    # if still no luck and we haven't tried iso-8859-2 yet, try that.
+    if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
+        try:
+            proposed_encoding = 'iso-8859-2'
+            tried_encodings.append(proposed_encoding)
+            data = _toUTF8(data, proposed_encoding)
+            known_encoding = use_strict_parser = 1
+        except:
+            pass
+    # if still no luck, give up
+    if not known_encoding:
+        result['bozo'] = 1
+        result['bozo_exception'] = CharacterEncodingUnknown( \
+            'document encoding unknown, I tried ' + \
+            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
+            (result['encoding'], xml_encoding))
+        result['encoding'] = ''
+    elif proposed_encoding != result['encoding']:
+        result['bozo'] = 1
+        result['bozo_exception'] = CharacterEncodingOverride( \
+            'document declared as %s, but parsed as %s' % \
+            (result['encoding'], proposed_encoding))
+        result['encoding'] = proposed_encoding
+
+    if not _XML_AVAILABLE:
+        use_strict_parser = 0
+    if use_strict_parser:
+        # initialize the SAX parser
+        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
+        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
+        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
+        saxparser.setContentHandler(feedparser)
+        saxparser.setErrorHandler(feedparser)
+        source = xml.sax.xmlreader.InputSource()
+        source.setByteStream(_StringIO(data))
+        if hasattr(saxparser, '_ns_stack'):
+            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
+            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
+            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
+        try:
+            saxparser.parse(source)
+        except Exception, e:
+            if _debug:
+                import traceback
+                traceback.print_stack()
+                traceback.print_exc()
+                sys.stderr.write('xml parsing failed\n')
+            result['bozo'] = 1
+            result['bozo_exception'] = feedparser.exc or e
+            use_strict_parser = 0
+    if not use_strict_parser:
+        feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities)
+        feedparser.feed(data)
+    result['feed'] = feedparser.feeddata
+    result['entries'] = feedparser.entries
+    result['version'] = result['version'] or feedparser.version
+    result['namespaces'] = feedparser.namespacesInUse
+    return result
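+
+# An editorial usage sketch, not part of the original module: it shows how the
+# bozo flag and encoding fields set above surface to callers. The feed URL is
+# a placeholder.
+#
+#     import feedparser
+#     d = feedparser.parse('http://example.com/feed.xml')
+#     if d['bozo']:
+#         print 'fell back to the loose parser:', d['bozo_exception']
+#     print d['encoding'], d['version'], len(d['entries'])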
+
+class Serializer:
+    def __init__(self, results):
+        self.results = results
+
+class TextSerializer(Serializer):
+    def write(self, stream=sys.stdout):
+        self._writer(stream, self.results, '')
+
+    def _writer(self, stream, node, prefix):
+        if not node: return
+        if hasattr(node, 'keys'):
+            keys = node.keys()
+            keys.sort()
+            for k in keys:
+                if k in ('description', 'link'): continue
+                if node.has_key(k + '_detail'): continue
+                if node.has_key(k + '_parsed'): continue
+                self._writer(stream, node[k], prefix + k + '.')
+        elif type(node) == types.ListType:
+            index = 0
+            for n in node:
+                self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].')
+                index += 1
+        else:
+            try:
+                s = str(node).encode('utf-8')
+                s = s.replace('\\', '\\\\')
+                s = s.replace('\r', '')
+                s = s.replace('\n', r'\n')
+                stream.write(prefix[:-1])
+                stream.write('=')
+                stream.write(s)
+                stream.write('\n')
+            except:
+                pass
+
+class PprintSerializer(Serializer):
+    def write(self, stream=sys.stdout):
+        if self.results.has_key('href'):
+            stream.write(self.results['href'] + '\n\n')
+        from pprint import pprint
+        pprint(self.results, stream)
+        stream.write('\n')
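+
+# Editorial sketch, not part of the original module: the __main__ block below
+# resolves a serializer class by name (options.format.capitalize() +
+# 'Serializer'), so a hypothetical "json" format would only need a class like:
+#
+#     class JsonSerializer(Serializer):
+#         def write(self, stream=sys.stdout):
+#             import simplejson
+#             simplejson.dump(self.results, stream, default=str, indent=2)
+#             stream.write('\n')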
+
+if __name__ == '__main__':
+    try:
+        from optparse import OptionParser
+    except:
+        OptionParser = None
+
+    if OptionParser:
+        optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-")
+        optionParser.set_defaults(format="pprint")
+        optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
+        optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
+        optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
+        optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
+        optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
+        optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
+        (options, urls) = optionParser.parse_args()
+        if options.verbose:
+            _debug = 1
+        if not urls:
+            optionParser.print_help()
+            sys.exit(0)
+    else:
+        if not sys.argv[1:]:
+            print __doc__
+            sys.exit(0)
+        class _Options:
+            etag = modified = agent = referrer = None
+            format = 'pprint'
+        options = _Options()
+        urls = sys.argv[1:]
+
+    zopeCompatibilityHack()
+
+    serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer)
+    for url in urls:
+        results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer)
+        serializer(results).write(sys.stdout)
... ...
@@ -0,0 +1,630 @@
+#!/usr/bin/python2.5
+# chmod 755 me, and make sure I have UNIX style newlines.
+#
+# techcrunch.py
+#
+# http://feeds.feedburner.com/TechCrunch
+# feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
+# feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments']
+#
+# TODO:
+# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry &raquo;</a>'
+#   link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/"
+
+import feedparser
+import yaml
+import sys
+import os
+import time
+import StringIO
+import codecs
+import traceback
+import calendar
+import pickle
+import exceptions
+import urllib
+import urllib2
+import httplib
+import shutil
+import glob
+import smtplib
+import bisect
+import analysis
+import simplejson as json
+import cookielib
+
+debug = True
+any_entry_added = False
+
+localdir = ''
+
+html_head = """
+<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'>
+<HTML><HEAD>
+  <title>TechCrunch Feed Filter</title>
+  <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> -->
+  <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" />
+  <style type="text/css">
+    body { font-family: "Arial", sans-serif; }
+    .author { font-size: smaller; }
+    .h3 { font-size: larger; }
+    a { text-decoration: none; }
+    /* table { border: none; border-collapse:collapse; font-size: large } */
+    table { border-collapse: collapse; }
+    table.legend { border:1px solid LightSlateGray; font-size: medium; border-collapse:separate; }
+    table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; }
+    table.legend td { border: 1px solid LightSlateGray; }
+    tr.even { background:#%s; padding: 2em; }
+    tr.odd { background:#%s; padding-bottom: 2em; }
+  </style>
+</HEAD>
+<BODY>
+<div align='center'><h3>TechCrunch Feed Filter</h3></div>
+This page shows what analysis is done to filter the noise away from the TechCrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
+"""
+
+html_footer = """
+</table>
+</div><br />
+<div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>,
+<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> &bull; <a href="stats.txt">status</a></div><br />
+</BODY>
+</HTML>
+"""
+
+img_width = 300
+img_height = 50
+
+series_1_color = "0000FF"
+series_2_color = "00AA00"
+threshold_color = "FF8C00"
+
+even_background = "F8F8F8"
+#even_background = "FFFFFF"
+odd_background = "E8E8E8"
+
+def asciiize( s ):
+    try:
+        return s.encode( 'ascii' )
+    except UnicodeEncodeError, e:
+        return s
+    except exceptions.AttributeError, e:
+        return s
+
+def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ):
+    """Sends Email"""
+    smtp = smtplib.SMTP( 'localhost' )
+    smtp.sendmail( fromaddr, \
+                   toaddrs, \
+                   "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % \
+                   ( fromaddr, ", ".join( toaddrs ), subject, message ) )
+    smtp.quit()
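+
+# Editorial usage sketch (hypothetical recipient): sendEmail builds the whole
+# message by hand, so the body text follows the blank line that ends the
+# headers.
+#
+#     sendEmail( 'techcrunch.py status',
+#                'All feeds processed.',
+#                ( 'admin@example.com', ) )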
+
+def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color ):
+#    comment_times, comment_values = zip( *comments )
+#    retweet_times, retweet_values = zip( *retweets )
+
+    # TODO handle failure cases, -1
+
+    if not len( comment_times ):
+        comment_times = [ time_posted, ]
+    if not len( comment_values ):
+        comment_values = [ 0, ]
+    if not len( retweet_times ):
+        retweet_times = [ time_posted, ]
+    if not len( retweet_values ):
+        retweet_values = [ 0, ]
+
+#    comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ]
+#    retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ]
+    comment_times = [ (i - time_posted) / 1800 for i in comment_times ]
+    retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ]
+
+    min_comment_time = min( comment_times )
+    max_comment_time = max( comment_times )
+    min_comment_value = min( comment_values )
+    max_comment_value = max( comment_values )
+    min_retweet_time = min( retweet_times )
+    max_retweet_time = max( retweet_times )
+    min_retweet_value = min( retweet_values )
+    max_retweet_value = max( retweet_values )
+
+    if len( comment_values ) < 8 and len( comment_values ) > 1:
+        # max_comment_value *= 2
+        pass
+    elif len( comment_values ) == 1:
+        min_comment_value = 0
+    if len( retweet_values ) < 8 and len( retweet_values ) > 1:
+        # max_retweet_value *= 2
+        pass
+    elif len( retweet_values ) == 1:
+        min_retweet_value = 0
+
+    min_comment_value = 0
+    min_retweet_value = 0
+
+    chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \
+                ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color )
+    chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ),
+                                          ','.join( [ str( n ) for n in comment_values ] ),
+                                          ','.join( [ str( n ) for n in retweet_times ] ),
+                                          ','.join( [ str( n ) for n in retweet_values ] ) )
+    if met_threshold_pt != -1:
+        chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt )
+    chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \
+                 ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value,
+                   0, max( 7, max_comment_time ),
+                   min_comment_value, max_comment_value,
+                   0, max( 7, max_retweet_time ),
+                   min_retweet_value, max_retweet_value )
+    chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, )
+    return chart_url
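+
+# Editorial usage sketch (illustrative values only): chart a post from two
+# hours ago with two comment samples and two retweet samples, and no
+# threshold marker (-1).
+#
+#     posted = int( time.time() ) - 7200
+#     url = make_chart_url( posted,
+#                           [ posted + 1800, posted + 3600 ], [ 4, 11 ],
+#                           [ posted + 1800, posted + 3600 ], [ 12, 40 ],
+#                           -1, even_background )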
+
+def process_feed( yaml_items ):
+    """
+    Retrieve the TechCrunch feed and process its entries.
+    yaml_items (in, out) The list of tracked item dicts, newest first.
+    """
+
+    feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' )
+    if hasattr( feed, 'status' ):
+        if feed.status == 304:
+            pass
+        else:
+            feed_is_modified = True
+            if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302:
+                if feed.status == 503:
+                    print "the feed is temporarily unavailable."
+                elif feed.status == 400:
+                    print "the feed says we made a bad request."
+                elif feed.status == 502:
+                    print "the feed reported a bad gateway error."
+                elif feed.status == 404:
+                    print "the feed says the page was not found."
+                elif feed.status == 500:
+                    print "the feed had an internal server error."
+                elif feed.status == 403:
+                    print "Access to the feed was forbidden."
+                else:
+                    print "the feed returned feed.status %d." % ( feed.status, )
+            else:
+                # Save off this feed.
+                f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' )
+                try:
+                    pickle.dump( feed, f )
+                except( pickle.PicklingError, exceptions.TypeError ), e:
+                    print "An error occurred while pickling the feed: %s." % \
+                          ( # str(e.__class__),
+                            str(e) )
+                    traceback.print_exc( file = sys.stdout )
+                    feed_is_modified = False
+                f.close()
+
+            for i in reversed( feed.entries ):
+                process_item( i, yaml_items )
+
+            # If we have more than 200 items, remove the old ones.
+            while len( yaml_items ) > 200:
+                yaml_items.pop()
+
+            cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) )
+
+            for i in yaml_items:
+                # i['title'] = asciiize( i['title'] )
+                # i['tags'] = map( asciiize, i['tags'] )
+                process_yaml_item( i, cookie )
+
+    else:
+        if hasattr(feed, 'bozo_exception'):
+            e = feed.bozo_exception
+            if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110:
+                print_last_line = True
+                if hasattr(e, 'reason'):
+                    if e.reason[0] == 110:
+                        print "the feed's connection timed out."
+                        print_last_line = False
+                    elif e.reason[0] == 111:
+                        print "the feed's connection was refused."
+                        print_last_line = False
+                    elif e.reason[0] == 104:
+                        print "the feed reset the connection."
+                        print_last_line = False
+                    else:
+                        print "the feed had a URLError with reason %s." % ( str(e.reason), )
+                        print_last_line = False
+                if print_last_line:
+                    print "the feed had a URLError %s" % ( str(e), )
+            elif isinstance( e, httplib.BadStatusLine ):
+                if hasattr(e, 'message'):
+                    print "the feed gave a bad status line %s." % ( str(e.message ), )
+                else:
+                    print "the feed gave a bad status line."
+            else:
+                if len( str(e) ):
+                    print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) )
+                else:
+                    print "the feed bozo_exception: %s %s" % ( str(e.__class__), repr(e) )
+        else:
+            print "the feed returned class %s, %s" % ( str(feed.__class__), str(feed) )
+
+def process_item( feed_item, yaml_items ):
+    # Get the time
+    global any_entry_added
+    timecode_now = int( time.time() )
+    date_parsed = time.gmtime()
+    if hasattr( feed_item, 'issued_parsed' ):
+        date_parsed = feed_item.issued_parsed
+        date_set = True
+    elif hasattr( feed_item, 'date_parsed' ):
+        date_parsed = feed_item.date_parsed
+        date_set = True
+    else:
+        print "process_item found no timestamp for", asciiize( feed_item.link )
+    timecode_parsed = calendar.timegm( date_parsed )
+
+    # Look for the item's link among the items we've already seen.
+    link = feed_item.link
+    if hasattr( feed_item, 'feedburner_origlink' ):
+        link = feed_item.feedburner_origlink
+    yaml_item = None
+    for i in yaml_items:
+        if link == i['link']:
+            yaml_item = i
+            break
+    if not yaml_item:
+        author = ''
+        if hasattr( feed_item, 'author' ):
+            author = asciiize( feed_item.author )
+
+        # Make a new yaml_item
+        yaml_item = { 'title'               : asciiize( feed_item.title ),
+                      'link'                : asciiize( link ),
+                      'author'              : author,
+                      'tags'                : [],
+                      'orig_posted'         : timecode_parsed,
+                      'qualified'           : -1,
+                      'comment_times'       : [],
+                      'comments'            : [],
+                      'slash_comment_times' : [],
+                      'slash_comments'      : [],
+                      'retweet_times'       : [],
+                      'retweets'            : []
+                    }
+        if hasattr( feed_item, 'tags' ):
+            for i in feed_item.tags:
+                yaml_item['tags'].append( asciiize( i.term ) )
+
+        yaml_items.insert( 0, yaml_item )
+        any_entry_added = True
+
+    # Maybe check to ensure that this item isn't too old.
+    if timecode_parsed < timecode_now - 60 * 30 * 9:
+        return
+
+    # Now, add the new values
+    if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8:
+        any_entry_added = True
+        yaml_item['slash_comment_times'].append( timecode_now )
+        yaml_item['slash_comments'].append( int( feed_item.slash_comments ) )
+
+def process_yaml_item( yaml_item, cookie ):
+    global any_entry_added
+
+    timecode_now = int( time.time() )
+    if len( yaml_item['comments'] ) < 8:
+        num_comments = Get_num_disqus_comments( yaml_item['link'], cookie )
+        if num_comments != -1:
+            any_entry_added = True
+            yaml_item['comment_times'].append( timecode_now )
+            yaml_item['comments'].append( num_comments )
+
+    if len( yaml_item['retweets'] ) < 8:
+        num_retweets = Get_num_retweets( yaml_item['link'] )
+        if num_retweets != -1:
+            any_entry_added = True
+            yaml_item['retweet_times'].append( timecode_now )
+            yaml_item['retweets'].append( num_retweets )
+
+def Get_num_comments( url_string ):
+    try:
+        f = urllib2.urlopen( url_string )
+        data = f.read()
+        f.close()
+    except urllib2.URLError, e:
+        if hasattr( e, 'reason' ):
+            print "Get_num_comments got an error:", e.reason
+        elif hasattr( e, 'code' ):
+            print "Get_num_comments got an error. Code:", e.code
+        return -1
+    tag_to_find = '<a href="#comments" rel="nofollow">'
+    offset = data.find( tag_to_find )
+    if offset != -1:
+        start_pos = offset + len( tag_to_find )
+        end_pos = start_pos
+        while str.isdigit( data[ end_pos ] ):
+            end_pos += 1
+        if end_pos > start_pos:
+            return int( data[start_pos:end_pos] )
+    return -1
+
+def Get_cookie( cookie_request ):
+    cookie = cookielib.CookieJar()
+    try:
+        cookie_response = urllib2.urlopen( cookie_request )
+        cookie.extract_cookies( cookie_response, cookie_request )
+        return cookie
+    except urllib2.URLError, e:
+        if hasattr( e, 'reason' ):
+            print "Get_cookie got an error:", e.reason
+        elif hasattr( e, 'code' ):
+            print "Get_cookie got an error. Code:", e.code
+    return None
+
+def Get_num_disqus_comments( url_string, cookie ):
+
+    if cookie == None:
+        return -1
+
+    try:
+        f = urllib2.urlopen( url_string )
+        data = f.read()
+        f.close()
+    except urllib2.URLError, e:
+        if hasattr( e, 'reason' ):
+            print "Get_num_disqus_comments got an error:", e.reason
+        elif hasattr( e, 'code' ):
+            print "Get_num_disqus_comments got an error. Code:", e.code
+        return -1
+
+    tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="'
+    disqus_tag_to_find = 'displayCount('
+    offset = data.find( tag_to_find )
+    if offset != -1:
+        start_pos = offset + len( tag_to_find )
+        end_pos = start_pos
+        while data[ end_pos ] != '"' and end_pos < start_pos + 200:
+            end_pos += 1
+        if end_pos < start_pos + 200:
+            opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) )
+            url_GET_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' )
+            request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + url_GET_data )
+            try:
+                response = opener.open( request )
+                disqus_data = response.read()
+            except urllib2.URLError, e:
+                if hasattr( e, 'reason' ):
+                    print "Get_num_disqus_comments got an error getting the count:", e.reason
+                elif hasattr( e, 'code' ):
+                    print "Get_num_disqus_comments got an error getting the count. Code:", e.code
+                disqus_data = ""
+            disqus_offset = disqus_data.find( disqus_tag_to_find )
+            if disqus_offset != -1:
+                start_pos = disqus_offset + len( disqus_tag_to_find )
+                end_pos = disqus_data.find( '}]})', start_pos )
+                if end_pos != -1:
+                    return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] )
+
+    return -1
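+
+# Editorial note (hypothetical sample; the real payload may carry more
+# fields): the count.js response parsed above looks roughly like
+#
+#     displayCount({"counts": [{"comments": 42}]})
+#
+# json.loads() is applied to the argument of displayCount(), and
+# ['counts'][0]['comments'] is read out.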
+
+def Get_num_retweets( url_string ):
+    try:
+        f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) )
+        data = f.read()
+        f.close()
+    except urllib2.URLError, e:
+        if hasattr( e, 'reason' ):
+            print "Get_num_retweets got an error:", e.reason
+        elif hasattr( e, 'code' ):
+            print "Get_num_retweets got an error. Code:", e.code
+        return -1
+    tag_to_find = '<span class="c">'
+    offset = data.find( tag_to_find )
+    if offset != -1:
+        start_pos = offset + len( tag_to_find )
+        end_pos = data.find( '<', start_pos )
+        if end_pos != -1:
+            return int( data[ start_pos:end_pos ] )
+    return -1
+
+def Save_image( url_string, file_path ):
+    try:
+        f = urllib2.urlopen( url_string )
+        data = f.read()
+        f.close()
+    except urllib2.URLError, e:
+        if hasattr( e, 'reason' ):
+            print "Save_image got an error:", e.reason
+        elif hasattr( e, 'code' ):
+            print "Save_image got an error. Code:", e.code
+        return url_string
+    if len( data ) > 50:
+        f = open( file_path, 'wb' )
+        f.write( data )
+        f.close()
+        return 'cache/' + os.path.basename( file_path )
+    return url_string
+
+def Make_index_html( yaml_items, stats ):
+    cur_time = int( time.time() )
+    new_index_fullpath = os.path.join( localdir, 'index.html_new' )
+    index_fullpath = os.path.join( localdir, 'index.html' )
+    cache_path = os.path.join( localdir, 'cache' )
+
+    files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )
+#    shutil.rmtree( cache_path )
+#    os.mkdir( cache_path )
+
+    f = file( new_index_fullpath, 'w' )
+    f.write( html_head % ( even_background, odd_background ) )
+#    f.write( '<div align="center">\n<table cellpadding="4">' )
+
+    f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' )
+    for median, mean, std_dev in stats:
+        f.write( '<td>med=%1.1f &#956;=%1.1f &#963;=%1.1f&nbsp;</td> ' % ( median, mean, std_dev ) )
+    f.write( '</tr>\n</table></div>\n<br />\n' )
+
+    f.write( '<div align="center">\n<table>\n' )
+    image_index = 0
+    for i in yaml_items[:40]:
+        chart_url = make_chart_url( i['orig_posted'],
+                                    i['comment_times'],
+                                    i['comments'],
+                                    i['retweet_times'],
+                                    i['retweets'],
+                                    i['qualified'],
+                                    image_index % 2 and even_background or odd_background )
+        image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) )
+        f.write( '<tr valign="middle" class="%s">\n  <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
+                 ( image_index % 2 and "even" or "odd",
+                   i['link'],
+                   i['title'].encode( 'ascii', 'xmlcharrefreplace' ),
+                   i['author'].encode( 'ascii', 'xmlcharrefreplace' ) ) )
+        f.write( '  <td>%s</td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
+        f.write( '  <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \
+                 ( image_url,
+                   img_width,
+                   img_height ) )
+        image_index += 1
+    f.write( html_footer )
+    f.close()
+    if os.path.exists( index_fullpath ):
+        os.unlink( index_fullpath )
+    shutil.move( new_index_fullpath, index_fullpath )
+    for fname in files_to_delete:
+        os.unlink( fname )
+
+def Make_feed_file( yaml_items ):
+    f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' )
+    f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" )
+    f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) )
+    count = 0
+    for item in yaml_items:
+        now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) )
+        if item['qualified'] != -1:
+            f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \
+                     ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ), now, item['link'], item['link'], item['author'].encode( 'ascii', 'xmlcharrefreplace' ) ) )
+            count += 1
+            if count > 14:
+                break
+    f.write( "</channel></rss>" )
+    f.close()
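+
+# Editorial note: given the format string above, each qualified item is
+# emitted roughly like this (illustrative values):
+#
+#     <item><title>Example Post</title><pubDate>Sat, 21 Aug 2010 17:13:19 +0000</pubDate>
+#     <link>http://techcrunch.com/2010/08/21/example-post/</link><guid isPermaLink="false">http://techcrunch.com/2010/08/21/example-post/</guid>
+#     <description><![CDATA[By: Example Author]]></description></item>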
+
+if __name__=='__main__':
+    start_time = time.time()
+    progress_text = []
+
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr
+    sys.stdout = sys.stderr = StringIO.StringIO()
+
+    try:
+        localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) )
+        #
+        # Read in techcrunch.yaml
+        #
+        # [ { 'title'               : 'Title Text',
+        #     'link'                : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
+        #     'author'              : u'MG Siegler',
+        #     'orig_posted'         : 1282197199,
+        #     'tags'                : [ u'Google', u'privacy' ],
+        #     'qualified'           : -1,
+        #     'comment_times'       : [ 1282197199, 1282197407 ],
+        #     'comments'            : [ 0, 15 ],
+        #     'slash_comment_times' : [ 1282197199, 1282197407 ],
+        #     'slash_comments'      : [ 0, 5 ],
+        #     'retweet_times'       : [ 1282197199, 1282197407 ],
+        #     'retweets'            : [ 0, 43 ]
+        #   },
+        #   { ... }
+        # ]
+        #
+        yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' )
+        if os.path.exists( yaml_fullpath ):
+            f = file( yaml_fullpath, 'rb' )
+            items = yaml.load( f )
+            f.close()
+        else:
+            print "could not open", yaml_fullpath
+            items = []
+
+        progress_text = [ "read techcrunch.yaml" ]
+        process_feed( items )
+
+        #
+        # If any work was done, then write files.
+        #
+        if True or any_entry_added:
+
+            stats = analysis.Process_retweets_for_feed( items )
+
+            # We'll only look at the stats for the time 1:00 to 1:30 after posting.
+            median, mean, sigma = stats[2]
+            threshold = median + sigma
+            for item in items:
+                if item['qualified'] == -1:
+                    for i in range( len( item['retweet_times'] ) ):
+                        r_time = item['retweet_times'][i]
+                        if r_time - item['orig_posted'] < 5400:
+                            if item['retweets'][i] >= threshold:
+                                item['qualified'] = i
+                        if r_time - item['orig_posted'] >= 3600:
+                            break
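+
+            # Editorial worked example (hypothetical numbers): if stats[2]
+            # were median=5.0, mean=7.4, sigma=3.1, then threshold would be
+            # 5.0 + 3.1 = 8.1, and an item qualifies at the first retweet
+            # sample taken inside 90 minutes (5400s) whose count reaches 8.1.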
+
+            #
+            # Write out the updated yaml file.
+            #
+            f = file( yaml_fullpath, 'wb' )
+            yaml.dump( items, f, width=120 )
+            f.close()
+            f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' )
+            yaml.dump( items, f, width=120 )
+            f.close()
+            f = codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' )
+            yaml.dump( items, f, encoding='utf-8', width=120 )
+            f.close()
+
+            Make_feed_file( items )
+
+            Make_index_html( items, stats )
+        else:
+            print "No entries were added this time."
+
+    except Exception, e:
+        exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str(e)
+        print exceptional_text, ' '.join( progress_text )
+        traceback.print_exc( file = sys.stdout )
+        try:
+            sendEmail( 'Exception thrown in techcrunch.py',
+                       exceptional_text,
+                       ( 'david.blume@gmail.com', ) )
+        except Exception, e:
+            print "Could not send email to notify you of the exception. :("
+
+    message = sys.stdout.getvalue()
+    sys.stdout = old_stdout
+    sys.stderr = old_stderr
+    if not debug:
+        print message
+
+    # Finally, let's save this to a statistics page
+    if os.path.exists( os.path.join( localdir, 'stats.txt' ) ):
+        f = open( os.path.join( localdir, 'stats.txt' ) )
+        try:
+            lines = f.readlines()
+        finally:
+            f.close()
+    else:
+        lines = []
+    lines = lines[:168] # Just keep the past week's worth
+    status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK"
+    lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status ) )
+    f = open( os.path.join( localdir, 'stats.txt' ), 'w' )
+    f.writelines( lines )
+    f.close()