David Blume committed on 2018-01-20 20:10:33
Showing 4 changed files, with 4291 additions and 0 deletions.
...
@@ -0,0 +1,19 @@
+Copyright (c) 2018, David Blume
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
...
@@ -0,0 +1,30 @@
+[![License](https://img.shields.io/badge/license-MIT_license-blue.svg)](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt)
+![python2.x](https://img.shields.io/badge/python-2.x-yellow.svg)
+# TechCrunch Feed Filter
+
+This is a Python script, run as a cronjob, that reads the TechCrunch article
+feed and decides which articles to include in its own feed.
+
+Here's a [blog post about it](http://david.dlma.com/blog/my-techcrunch-feed-filter).
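Below is a minimal sketch of the fetch step, using the copy of feedparser bundled in this repo. The feed URL and the keep/skip test are illustrative placeholders only, not the actual scoring logic in techcrunch.py.

```python
import feedparser  # the feedparser.py bundled alongside techcrunch.py

# Hypothetical feed URL; the real script reads the TechCrunch article feed.
d = feedparser.parse('http://feeds.feedburner.com/TechCrunch/')

for entry in d.entries:
    # Placeholder test standing in for the real tag/author scoring.
    if 'apple' in entry.title.lower():
        print entry.title, entry.link
```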
+
+# History
+
+This was originally archived in a Subversion repo. I'd forgotten about the
+version control and had gotten into the habit of just modifying the production
+site.
+
+* 2010-09-03: Original
+* 2010-09-03: Save off the disqus identifier for use later.
+* 2011-02-04: Algorithm changes (tags and author checked), new chart drawing, spaces used instead of tabs.
+* 2011-02-04: Update to the chart drawing algorithm.
+* 2013-08-04: Miscellaneous changes to techcrunch.py
+* 2015-11-23: Resync svn with production site.
+* 2015-11-27: Remove obsolete disqus and retweet code, and refactor style to be more PEP-8ish.
+
+# Is it any good?
+
+[Yes](https://news.ycombinator.com/item?id=3067434).
+
+# License
+
+This software uses the [MIT license](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt).
...
@@ -0,0 +1,3612 @@
+#!/usr/bin/env python
+"""Universal feed parser
+
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
+
+Visit http://feedparser.org/ for the latest version
+Visit http://feedparser.org/docs/ for the latest documentation
+
+Required: Python 2.1 or later
+Recommended: Python 2.3 or later
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
+"""
+
+__version__ = "4.2-pre-" + "$Revision: 291 $"[11:14] + "-svn"
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE."""
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+                    "John Beimler <http://john.beimler.org/>",
+                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+                    "Aaron Swartz <http://aaronsw.com/>",
+                    "Kevin Marks <http://epeus.blogspot.com/>",
+                    "Sam Ruby <http://intertwingly.net/>"]
+_debug = 0
+
+# HTTP "User-Agent" header to send to servers when downloading feeds.
+# If you are embedding feedparser in a larger application, you should
+# change this to your application name and URL.
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
+
+# HTTP "Accept" header to send to servers when downloading feeds. If you don't
+# want to send an Accept header, set this to None.
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
+
+# List of preferred XML parsers, by SAX driver name. These will be tried first,
+# but if they're not installed, Python will keep searching through its own list
+# of pre-installed parsers until it finds one that supports everything we need.
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
+
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set
+# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+# or utidylib <http://utidylib.berlios.de/>.
+TIDY_MARKUP = 0
+
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
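These module-level switches are meant to be overridden by an embedding application before it calls `parse()`. A sketch of that pattern (an illustrative aside, not lines from the commit; the application name and feed URL are placeholders):

```python
import feedparser

# Identify the embedding application, as the USER_AGENT comment recommends.
feedparser.USER_AGENT = 'MyApp/1.0 +http://example.com/myapp/'

# The other flags can be flipped the same way before parsing, e.g.:
# feedparser.RESOLVE_RELATIVE_URIS = 0
# feedparser.SANITIZE_HTML = 0

d = feedparser.parse('http://example.com/feed.xml')
```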
+
+# ---------- required modules (should come with any Python distribution) ----------
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
+try:
+    from cStringIO import StringIO as _StringIO
+except:
+    from StringIO import StringIO as _StringIO
+
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
+# gzip is included with most Python distributions, but may not be available if you compiled your own
+try:
+    import gzip
+except:
+    gzip = None
+try:
+    import zlib
+except:
+    zlib = None
+
+# If a real XML parser is available, feedparser will attempt to use it. feedparser has
+# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
+try:
+    import xml.sax
+    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
+    from xml.sax.saxutils import escape as _xmlescape
+    _XML_AVAILABLE = 1
+except:
+    _XML_AVAILABLE = 0
+    def _xmlescape(data,entities={}):
+        data = data.replace('&', '&amp;')
+        data = data.replace('>', '&gt;')
+        data = data.replace('<', '&lt;')
+        for char, entity in entities:
+            data = data.replace(char, entity)
+        return data
+
+# base64 support for Atom feeds that contain embedded binary data
+try:
+    import base64, binascii
+except:
+    base64 = binascii = None
+
+# cjkcodecs and iconv_codec provide support for more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+try:
+    import cjkcodecs.aliases
+except:
+    pass
+try:
+    import iconv_codec
+except:
+    pass
+
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+    import chardet
+    if _debug:
+        import chardet.constants
+        chardet.constants._debug = 1
+except:
+    chardet = None
+
+# reversable htmlentitydefs mappings for Python 2.2
+try:
+    from htmlentitydefs import name2codepoint, codepoint2name
+except:
+    import htmlentitydefs
+    name2codepoint={}
+    codepoint2name={}
+    for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
+        if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
+        name2codepoint[name]=ord(codepoint)
+        codepoint2name[ord(codepoint)]=name
+
+# BeautifulSoup parser used for parsing microformats from embedded HTML content
+# http://www.crummy.com/software/BeautifulSoup/
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
+# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
+# patch and modify the compatibility statement accordingly.
+try:
+    import BeautifulSoup
+except:
+    BeautifulSoup = None
+
+# ---------- don't touch these ----------
+class ThingsNobodyCaresAboutButMe(Exception): pass
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
+class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+class UndeclaredNamespace(Exception): pass
+
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+sgmllib.special = re.compile('<!')
+sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
+
+if sgmllib.endbracket.search(' <').start(0):
+    class EndBracketMatch:
+        endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
+        def search(self,string,index=0):
+            self.match = self.endbracket.match(string,index)
+            if self.match: return self
+        def start(self,n):
+            return self.match.end(n)
+    sgmllib.endbracket = EndBracketMatch()
+
+SUPPORTED_VERSIONS = {'': 'unknown',
+                      'rss090': 'RSS 0.90',
+                      'rss091n': 'RSS 0.91 (Netscape)',
+                      'rss091u': 'RSS 0.91 (Userland)',
+                      'rss092': 'RSS 0.92',
+                      'rss093': 'RSS 0.93',
+                      'rss094': 'RSS 0.94',
+                      'rss20': 'RSS 2.0',
+                      'rss10': 'RSS 1.0',
+                      'rss': 'RSS (unknown version)',
+                      'atom01': 'Atom 0.1',
+                      'atom02': 'Atom 0.2',
+                      'atom03': 'Atom 0.3',
+                      'atom10': 'Atom 1.0',
+                      'atom': 'Atom (unknown version)',
+                      'cdf': 'CDF',
+                      'hotrss': 'Hot RSS'
+                      }
+
+try:
+    UserDict = dict
+except NameError:
+    # Python 2.1 does not have dict
+    from UserDict import UserDict
+    def dict(aList):
+        rc = {}
+        for k, v in aList:
+            rc[k] = v
+        return rc
+
+class FeedParserDict(UserDict):
+    keymap = {'channel': 'feed',
+              'items': 'entries',
+              'guid': 'id',
+              'date': 'updated',
+              'date_parsed': 'updated_parsed',
+              'description': ['subtitle', 'summary'],
+              'url': ['href'],
+              'modified': 'updated',
+              'modified_parsed': 'updated_parsed',
+              'issued': 'published',
+              'issued_parsed': 'published_parsed',
+              'copyright': 'rights',
+              'copyright_detail': 'rights_detail',
+              'tagline': 'subtitle',
+              'tagline_detail': 'subtitle_detail'}
+    def __getitem__(self, key):
+        if key == 'category':
+            return UserDict.__getitem__(self, 'tags')[0]['term']
+        if key == 'enclosures':
+            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
+            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
+        if key == 'license':
+            for link in UserDict.__getitem__(self, 'links'):
+                if link['rel']=='license' and link.has_key('href'):
+                    return link['href']
+        if key == 'categories':
+            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
+        realkey = self.keymap.get(key, key)
+        if type(realkey) == types.ListType:
+            for k in realkey:
+                if UserDict.has_key(self, k):
+                    return UserDict.__getitem__(self, k)
+        if UserDict.has_key(self, key):
+            return UserDict.__getitem__(self, key)
+        return UserDict.__getitem__(self, realkey)
+
+    def __setitem__(self, key, value):
+        for k in self.keymap.keys():
+            if key == k:
+                key = self.keymap[k]
+                if type(key) == types.ListType:
+                    key = key[0]
+        return UserDict.__setitem__(self, key, value)
+
+    def get(self, key, default=None):
+        if self.has_key(key):
+            return self[key]
+        else:
+            return default
+
+    def setdefault(self, key, value):
+        if not self.has_key(key):
+            self[key] = value
+        return self[key]
+
+    def has_key(self, key):
+        try:
+            return hasattr(self, key) or UserDict.has_key(self, key)
+        except AttributeError:
+            return False
+
+    def __getattr__(self, key):
+        try:
+            return self.__dict__[key]
+        except KeyError:
+            pass
+        try:
+            assert not key.startswith('_')
+            return self.__getitem__(key)
+        except:
+            raise AttributeError, "object has no attribute '%s'" % key
+
+    def __setattr__(self, key, value):
+        if key.startswith('_') or key == 'data':
+            self.__dict__[key] = value
+        else:
+            return self.__setitem__(key, value)
+
+    def __contains__(self, key):
+        return self.has_key(key)
+
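A quick illustration of what the keymap above gives callers (an editorial sketch, not lines from the commit): legacy RSS-style keys, their Atom-style equivalents, and attribute access all reach the same stored value.

```python
# Illustrative only. 'tagline' is aliased to 'subtitle' via keymap, and
# __getattr__ layers attribute-style access over item access.
d = FeedParserDict()
d['tagline'] = 'An example subtitle'   # stored under the canonical 'subtitle' key
print d['subtitle']                    # 'An example subtitle'
print d['tagline']                     # same value through the legacy alias
print d.subtitle                       # attribute access goes through __getattr__
```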
+def zopeCompatibilityHack():
+    global FeedParserDict
+    del FeedParserDict
+    def FeedParserDict(aDict=None):
+        rc = {}
+        if aDict:
+            rc.update(aDict)
+        return rc
+
+_ebcdic_to_ascii_map = None
+def _ebcdic_to_ascii(s):
+    global _ebcdic_to_ascii_map
+    if not _ebcdic_to_ascii_map:
+        emap = (
+            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
+            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
+            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
+            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
+            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
+            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
+            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+            )
+        import string
+        _ebcdic_to_ascii_map = string.maketrans( \
+            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+    return s.translate(_ebcdic_to_ascii_map)
+
+_cp1252 = {
+  unichr(128): unichr(8364), # euro sign
+  unichr(130): unichr(8218), # single low-9 quotation mark
+  unichr(131): unichr( 402), # latin small letter f with hook
+  unichr(132): unichr(8222), # double low-9 quotation mark
+  unichr(133): unichr(8230), # horizontal ellipsis
+  unichr(134): unichr(8224), # dagger
+  unichr(135): unichr(8225), # double dagger
+  unichr(136): unichr( 710), # modifier letter circumflex accent
+  unichr(137): unichr(8240), # per mille sign
+  unichr(138): unichr( 352), # latin capital letter s with caron
+  unichr(139): unichr(8249), # single left-pointing angle quotation mark
+  unichr(140): unichr( 338), # latin capital ligature oe
+  unichr(142): unichr( 381), # latin capital letter z with caron
+  unichr(145): unichr(8216), # left single quotation mark
+  unichr(146): unichr(8217), # right single quotation mark
+  unichr(147): unichr(8220), # left double quotation mark
+  unichr(148): unichr(8221), # right double quotation mark
+  unichr(149): unichr(8226), # bullet
+  unichr(150): unichr(8211), # en dash
+  unichr(151): unichr(8212), # em dash
+  unichr(152): unichr( 732), # small tilde
+  unichr(153): unichr(8482), # trade mark sign
+  unichr(154): unichr( 353), # latin small letter s with caron
+  unichr(155): unichr(8250), # single right-pointing angle quotation mark
+  unichr(156): unichr( 339), # latin small ligature oe
+  unichr(158): unichr( 382), # latin small letter z with caron
+  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
+
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+def _urljoin(base, uri):
+    uri = _urifixer.sub(r'\1\3', uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
+
+class _FeedParserMixin:
+    namespaces = {'': '',
+                  'http://backend.userland.com/rss': '',
+                  'http://blogs.law.harvard.edu/tech/rss': '',
+                  'http://purl.org/rss/1.0/': '',
+                  'http://my.netscape.com/rdf/simple/0.9/': '',
+                  'http://example.com/newformat#': '',
+                  'http://example.com/necho': '',
+                  'http://purl.org/echo/': '',
+                  'uri/of/echo/namespace#': '',
+                  'http://purl.org/pie/': '',
+                  'http://purl.org/atom/ns#': '',
+                  'http://www.w3.org/2005/Atom': '',
+                  'http://purl.org/rss/1.0/modules/rss091#': '',
+
+                  'http://webns.net/mvcb/': 'admin',
+                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
+                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
+                  'http://media.tangent.org/rss/1.0/': 'audio',
+                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
+                  'http://web.resource.org/cc/': 'cc',
+                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
+                  'http://purl.org/rss/1.0/modules/company': 'co',
+                  'http://purl.org/rss/1.0/modules/content/': 'content',
+                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
+                  'http://purl.org/dc/elements/1.1/': 'dc',
+                  'http://purl.org/dc/terms/': 'dcterms',
+                  'http://purl.org/rss/1.0/modules/email/': 'email',
+                  'http://purl.org/rss/1.0/modules/event/': 'ev',
+                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
+                  'http://freshmeat.net/rss/fm/': 'fm',
+                  'http://xmlns.com/foaf/0.1/': 'foaf',
+                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
+                  'http://postneo.com/icbm/': 'icbm',
+                  'http://purl.org/rss/1.0/modules/image/': 'image',
+                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
+                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+                  'http://purl.org/rss/1.0/modules/link/': 'l',
+                  'http://search.yahoo.com/mrss': 'media',
+                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
+                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
+                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
+                  'http://purl.org/rss/1.0/modules/search/': 'search',
+                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
+                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
+                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
+                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
+                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
+                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
+                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
+                  'http://wellformedweb.org/commentAPI/': 'wfw',
+                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+                  'http://www.w3.org/1999/xhtml': 'xhtml',
+                  'http://www.w3.org/1999/xlink': 'xlink',
+                  'http://www.w3.org/XML/1998/namespace': 'xml'
+}
+    _matchnamespaces = {}
+
+    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    html_types = ['text/html', 'application/xhtml+xml']
+
+    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
+        if _debug: sys.stderr.write('initializing FeedParser\n')
+        if not self._matchnamespaces:
+            for k, v in self.namespaces.items():
+                self._matchnamespaces[k.lower()] = v
+        self.feeddata = FeedParserDict() # feed-level data
+        self.encoding = encoding # character encoding
+        self.entries = [] # list of entry-level data
+        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
+        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
+
+        # the following are used internally to track state;
+        # this is really out of control and should be refactored
+        self.infeed = 0
+        self.inentry = 0
+        self.incontent = 0
+        self.intextinput = 0
+        self.inimage = 0
+        self.inauthor = 0
+        self.incontributor = 0
+        self.inpublisher = 0
+        self.insource = 0
+        self.sourcedata = FeedParserDict()
+        self.contentparams = FeedParserDict()
+        self._summaryKey = None
+        self.namespacemap = {}
+        self.elementstack = []
+        self.basestack = []
+        self.langstack = []
+        self.baseuri = baseuri or ''
+        self.lang = baselang or None
+        self.svgOK = 0
+        self.hasTitle = 0
+        if baselang:
+            self.feeddata['language'] = baselang.replace('_','-')
+
+    def unknown_starttag(self, tag, attrs):
+        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
+        # normalize attrs
+        attrs = [(k.lower(), v) for k, v in attrs]
+        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+
+        # track xml:base and xml:lang
+        attrsD = dict(attrs)
+        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+        if type(baseuri) != type(u''):
+            try:
+                baseuri = unicode(baseuri, self.encoding)
+            except:
+                baseuri = unicode(baseuri, 'iso-8859-1')
+        self.baseuri = _urljoin(self.baseuri, baseuri)
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
+        if lang == '':
+            # xml:lang could be explicitly set to '', we need to capture that
+            lang = None
+        elif lang is None:
+            # if no xml:lang is specified, use parent lang
+            lang = self.lang
+        if lang:
+            if tag in ('feed', 'rss', 'rdf:RDF'):
+                self.feeddata['language'] = lang.replace('_','-')
+        self.lang = lang
+        self.basestack.append(self.baseuri)
+        self.langstack.append(lang)
+
+        # track namespaces
+        for prefix, uri in attrs:
+            if prefix.startswith('xmlns:'):
+                self.trackNamespace(prefix[6:], uri)
+            elif prefix == 'xmlns':
+                self.trackNamespace(None, uri)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+            # element declared itself as escaped markup, but it isn't really
+            self.contentparams['type'] = 'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+            if tag.find(':') <> -1:
+                prefix, tag = tag.split(':', 1)
+                namespace = self.namespacesInUse.get(prefix, '')
+                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+                    attrs.append(('xmlns',namespace))
+                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
+                    attrs.append(('xmlns',namespace))
+            if tag == 'svg': self.svgOK += 1
+            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+
+        # special hack for better tracking of empty textinput/image elements in illformed feeds
+        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
+            self.intextinput = 0
+        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
+            self.inimage = 0
+
+        # call special handler (if defined) or default handler
+        methodname = '_start_' + prefix + suffix
+        try:
+            method = getattr(self, methodname)
+            return method(attrsD)
+        except AttributeError:
+            return self.push(prefix + suffix, 1)
+
+    def unknown_endtag(self, tag):
+        if _debug: sys.stderr.write('end %s\n' % tag)
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+        if suffix == 'svg' and self.svgOK: self.svgOK -= 1
+
+        # call special handler (if defined) or default handler
+        methodname = '_end_' + prefix + suffix
+        try:
+            if self.svgOK: raise AttributeError()
+            method = getattr(self, methodname)
+            method()
+        except AttributeError:
+            self.pop(prefix + suffix)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            # element declared itself as escaped markup, but it isn't really
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+            self.contentparams['type'] = 'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+            tag = tag.split(':')[-1]
+            self.handle_data('</%s>' % tag, escape=0)
+
+        # track xml:base and xml:lang going out of scope
+        if self.basestack:
+            self.basestack.pop()
+            if self.basestack and self.basestack[-1]:
+                self.baseuri = self.basestack[-1]
+        if self.langstack:
+            self.langstack.pop()
+            if self.langstack: # and (self.langstack[-1] is not None):
+                self.lang = self.langstack[-1]
+
+    def handle_charref(self, ref):
+        # called for each character reference, e.g. for '&#160;', ref will be '160'
+        if not self.elementstack: return
+        ref = ref.lower()
+        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
+            text = '&#%s;' % ref
+        else:
+            if ref[0] == 'x':
+                c = int(ref[1:], 16)
+            else:
+                c = int(ref)
+            text = unichr(c).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_entityref(self, ref):
+        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
+        if not self.elementstack: return
+        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
+        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+            text = '&%s;' % ref
+        elif ref in self.entities.keys():
+            text = self.entities[ref]
+            if text.startswith('&#') and text.endswith(';'):
+                return self.handle_entityref(text)
+        else:
+            try: name2codepoint[ref]
+            except KeyError: text = '&%s;' % ref
+            else: text = unichr(name2codepoint[ref]).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_data(self, text, escape=1):
+        # called for each block of plain text, i.e. outside of any tag and
+        # not containing any character or entity references
+        if not self.elementstack: return
+        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
+            text = _xmlescape(text)
+        self.elementstack[-1][2].append(text)
+
+    def handle_comment(self, text):
+        # called for each comment, e.g. <!-- insert message here -->
+        pass
+
+    def handle_pi(self, text):
+        # called for each processing instruction, e.g. <?instruction>
+        pass
+
+    def handle_decl(self, text):
+        pass
+
+    def parse_declaration(self, i):
+        # override internal declaration handler to handle CDATA blocks
+        if _debug: sys.stderr.write('entering parse_declaration\n')
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1: k = len(self.rawdata)
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+            return k+3
+        else:
+            k = self.rawdata.find('>', i)
+            return k+1
+
+    def mapContentType(self, contentType):
+        contentType = contentType.lower()
+        if contentType == 'text':
+            contentType = 'text/plain'
+        elif contentType == 'html':
+            contentType = 'text/html'
+        elif contentType == 'xhtml':
+            contentType = 'application/xhtml+xml'
+        return contentType
+
+    def trackNamespace(self, prefix, uri):
+        loweruri = uri.lower()
+        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
+            self.version = 'rss090'
+        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
+            self.version = 'rss10'
+        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+            self.version = 'atom10'
+        if loweruri.find('backend.userland.com/rss') <> -1:
+            # match any backend.userland.com namespace
+            uri = 'http://backend.userland.com/rss'
+            loweruri = uri
+        if self._matchnamespaces.has_key(loweruri):
+            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+        else:
+            self.namespacesInUse[prefix or ''] = uri
+
+    def resolveURI(self, uri):
+        return _urljoin(self.baseuri or '', uri)
+
+    def decodeEntities(self, element, data):
+        return data
+
+    def strattrs(self, attrs):
+        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
+
+    def push(self, element, expectingText):
+        self.elementstack.append([element, expectingText, []])
+
|
694 |
+ if not self.elementstack: return |
|
695 |
+ if self.elementstack[-1][0] != element: return |
|
696 |
+ |
|
697 |
+ element, expectingText, pieces = self.elementstack.pop() |
|
698 |
+ |
|
699 |
+ if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': |
|
700 |
+ # remove enclosing child element, but only if it is a <div> and |
|
701 |
+ # only if all the remaining content is nested underneath it. |
|
702 |
+ # This means that the divs would be retained in the following: |
|
703 |
+ # <div>foo</div><div>bar</div> |
|
704 |
+ while pieces and len(pieces)>1 and not pieces[-1].strip(): |
|
705 |
+ del pieces[-1] |
|
706 |
+ while pieces and len(pieces)>1 and not pieces[0].strip(): |
|
707 |
+ del pieces[0] |
|
708 |
+ if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>': |
|
709 |
+ depth = 0 |
|
710 |
+ for piece in pieces[:-1]: |
|
711 |
+ if piece.startswith('</'): |
|
712 |
+ depth -= 1 |
|
713 |
+ if depth == 0: break |
|
714 |
+ elif piece.startswith('<') and not piece.endswith('/>'): |
|
715 |
+ depth += 1 |
|
716 |
+ else: |
|
717 |
+ pieces = pieces[1:-1] |
|
718 |
+ |
|
719 |
+ output = ''.join(pieces) |
|
720 |
+ if stripWhitespace: |
|
721 |
+ output = output.strip() |
|
722 |
+ if not expectingText: return output |
|
723 |
+ |
|
724 |
+ # decode base64 content |
|
725 |
+ if base64 and self.contentparams.get('base64', 0): |
|
726 |
+ try: |
|
727 |
+ output = base64.decodestring(output) |
|
728 |
+ except binascii.Error: |
|
729 |
+ pass |
|
730 |
+ except binascii.Incomplete: |
|
731 |
+ pass |
|
732 |
+ |
|
733 |
+ # resolve relative URIs |
|
734 |
+ if (element in self.can_be_relative_uri) and output: |
|
735 |
+ output = self.resolveURI(output) |
|
736 |
+ |
|
737 |
+ # decode entities within embedded markup |
|
738 |
+ if not self.contentparams.get('base64', 0): |
|
739 |
+ output = self.decodeEntities(element, output) |
|
740 |
+ |
|
741 |
+ if self.lookslikehtml(output): |
|
742 |
+ self.contentparams['type']='text/html' |
|
743 |
+ |
|
744 |
+ # remove temporary cruft from contentparams |
|
745 |
+ try: |
|
746 |
+ del self.contentparams['mode'] |
|
747 |
+ except KeyError: |
|
748 |
+ pass |
|
749 |
+ try: |
|
750 |
+ del self.contentparams['base64'] |
|
751 |
+ except KeyError: |
|
752 |
+ pass |
|
753 |
+ |
|
754 |
+ is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types |
|
755 |
+ # resolve relative URIs within embedded markup |
|
756 |
+ if is_htmlish and RESOLVE_RELATIVE_URIS: |
|
757 |
+ if element in self.can_contain_relative_uris: |
|
758 |
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) |
|
759 |
+ |
|
760 |
+ # parse microformats |
|
761 |
+ # (must do this before sanitizing because some microformats |
|
762 |
+ # rely on elements that we sanitize) |
|
763 |
+ if is_htmlish and element in ['content', 'description', 'summary']: |
|
764 |
+ mfresults = _parseMicroformats(output, self.baseuri, self.encoding) |
|
765 |
+ if mfresults: |
|
766 |
+ for tag in mfresults.get('tags', []): |
|
767 |
+ self._addTag(tag['term'], tag['scheme'], tag['label']) |
|
768 |
+ for enclosure in mfresults.get('enclosures', []): |
|
769 |
+ self._start_enclosure(enclosure) |
|
770 |
+ for xfn in mfresults.get('xfn', []): |
|
771 |
+ self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) |
|
772 |
+ vcard = mfresults.get('vcard') |
|
773 |
+ if vcard: |
|
774 |
+ self._getContext()['vcard'] = vcard |
|
775 |
+ |
|
776 |
+ # sanitize embedded markup |
|
777 |
+ if is_htmlish and SANITIZE_HTML: |
|
778 |
+ if element in self.can_contain_dangerous_markup: |
|
779 |
+ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) |
|
780 |
+ |
|
781 |
+ if self.encoding and type(output) != type(u''): |
|
782 |
+ try: |
|
783 |
+ output = unicode(output, self.encoding) |
|
784 |
+ except: |
|
785 |
+ pass |
|
786 |
+ |
|
787 |
+ # address common error where people take data that is already |
|
788 |
+ # utf-8, presume that it is iso-8859-1, and re-encode it. |
|
789 |
+ if self.encoding=='utf-8' and type(output) == type(u''): |
|
790 |
+ try: |
|
791 |
+ output = unicode(output.encode('iso-8859-1'), 'utf-8') |
|
792 |
+ except: |
|
793 |
+ pass |
|
794 |
+ |
|
795 |
+ # map win-1252 extensions to the proper code points |
|
796 |
+ if type(output) == type(u''): |
|
797 |
+ output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) |
|
798 |
+ |
|
799 |
+ # categories/tags/keywords/whatever are handled in _end_category |
|
800 |
+ if element == 'category': |
|
801 |
+ return output |
|
802 |
+ |
|
803 |
+ if element == 'title' and self.hasTitle: |
|
804 |
+ return output |
|
805 |
+ |
|
806 |
+ # store output in appropriate place(s) |
|
807 |
+ if self.inentry and not self.insource: |
|
808 |
+ if element == 'content': |
|
809 |
+ self.entries[-1].setdefault(element, []) |
|
810 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
811 |
+ contentparams['value'] = output |
|
812 |
+ self.entries[-1][element].append(contentparams) |
|
813 |
+ elif element == 'link': |
|
814 |
+ self.entries[-1][element] = output |
|
815 |
+ if output: |
|
816 |
+ self.entries[-1]['links'][-1]['href'] = output |
|
817 |
+ else: |
|
818 |
+ if element == 'description': |
|
819 |
+ element = 'summary' |
|
820 |
+ self.entries[-1][element] = output |
|
821 |
+ if self.incontent: |
|
822 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
823 |
+ contentparams['value'] = output |
|
824 |
+ self.entries[-1][element + '_detail'] = contentparams |
|
825 |
+ elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage): |
|
826 |
+ context = self._getContext() |
|
827 |
+ if element == 'description': |
|
828 |
+ element = 'subtitle' |
|
829 |
+ context[element] = output |
|
830 |
+ if element == 'link': |
|
831 |
+ context['links'][-1]['href'] = output |
|
832 |
+ elif self.incontent: |
|
833 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
834 |
+ contentparams['value'] = output |
|
835 |
+ context[element + '_detail'] = contentparams |
|
836 |
+ return output |
|
837 |
+ |
|
838 |
+ def pushContent(self, tag, attrsD, defaultContentType, expectingText): |
|
839 |
+ self.incontent += 1 |
|
840 |
+ if self.lang: self.lang=self.lang.replace('_','-') |
|
841 |
+ self.contentparams = FeedParserDict({ |
|
842 |
+ 'type': self.mapContentType(attrsD.get('type', defaultContentType)), |
|
843 |
+ 'language': self.lang, |
|
844 |
+ 'base': self.baseuri}) |
|
845 |
+ self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) |
|
846 |
+ self.push(tag, expectingText) |
|
847 |
+ |
|
848 |
+ def popContent(self, tag): |
|
849 |
+ value = self.pop(tag) |
|
850 |
+ self.incontent -= 1 |
|
851 |
+ self.contentparams.clear() |
|
852 |
+ return value |
|
853 |
+ |
|
854 |
+ # a number of elements in a number of RSS variants are nominally plain |
|
855 |
+ # text, but this is routinely ignored. This is an attempt to detect |
|
856 |
+ # the most common cases. As false positives often result in silent |
|
857 |
+ # data loss, this function errs on the conservative side. |
|
858 |
+ def lookslikehtml(self, str): |
|
859 |
+ if self.version.startswith('atom'): return |
|
860 |
+ if self.contentparams.get('type','text/html') != 'text/plain': return |
|
861 |
+ |
|
862 |
+ # must have a close tag or a entity reference to qualify |
|
863 |
+ if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return |
|
864 |
+ |
|
865 |
+ # all tags must be in a restricted subset of valid HTML tags |
|
866 |
+ if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, |
|
867 |
+ re.findall(r'</?(\w+)',str)): return |
|
868 |
+ |
|
869 |
+ # all entities must have been defined as valid HTML entities |
|
870 |
+ from htmlentitydefs import entitydefs |
|
871 |
+ if filter(lambda e: e not in entitydefs.keys(), |
|
872 |
+ re.findall(r'&(\w+);',str)): return |
|
873 |
+ |
|
874 |
+ return 1 |
|
875 |
+ |
|
876 |
+ def _mapToStandardPrefix(self, name): |
|
877 |
+ colonpos = name.find(':') |
|
878 |
+ if colonpos <> -1: |
|
879 |
+ prefix = name[:colonpos] |
|
880 |
+ suffix = name[colonpos+1:] |
|
881 |
+ prefix = self.namespacemap.get(prefix, prefix) |
|
882 |
+ name = prefix + ':' + suffix |
|
883 |
+ return name |
|
884 |
+ |
|
885 |
+ def _getAttribute(self, attrsD, name): |
|
886 |
+ return attrsD.get(self._mapToStandardPrefix(name)) |
|
887 |
+ |
|
888 |
+ def _isBase64(self, attrsD, contentparams): |
|
889 |
+ if attrsD.get('mode', '') == 'base64': |
|
890 |
+ return 1 |
|
891 |
+ if self.contentparams['type'].startswith('text/'): |
|
892 |
+ return 0 |
|
893 |
+ if self.contentparams['type'].endswith('+xml'): |
|
894 |
+ return 0 |
|
895 |
+ if self.contentparams['type'].endswith('/xml'): |
|
896 |
+ return 0 |
|
897 |
+ return 1 |
|
898 |
+ |
|
899 |
+ def _itsAnHrefDamnIt(self, attrsD): |
|
900 |
+ href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) |
|
901 |
+ if href: |
|
902 |
+ try: |
|
903 |
+ del attrsD['url'] |
|
904 |
+ except KeyError: |
|
905 |
+ pass |
|
906 |
+ try: |
|
907 |
+ del attrsD['uri'] |
|
908 |
+ except KeyError: |
|
909 |
+ pass |
|
910 |
+ attrsD['href'] = href |
|
911 |
+ return attrsD |
|
912 |
+ |
|
913 |
+ def _save(self, key, value): |
|
914 |
+ context = self._getContext() |
|
915 |
+ context.setdefault(key, value) |
|
916 |
+ |
|
917 |
+ def _start_rss(self, attrsD): |
|
918 |
+ versionmap = {'0.91': 'rss091u', |
|
919 |
+ '0.92': 'rss092', |
|
920 |
+ '0.93': 'rss093', |
|
921 |
+ '0.94': 'rss094'} |
|
922 |
+ if not self.version: |
|
923 |
+ attr_version = attrsD.get('version', '') |
|
924 |
+ version = versionmap.get(attr_version) |
|
925 |
+ if version: |
|
926 |
+ self.version = version |
|
927 |
+ elif attr_version.startswith('2.'): |
|
928 |
+ self.version = 'rss20' |
|
929 |
+ else: |
|
930 |
+ self.version = 'rss' |
|
931 |
+ |
|
932 |
+ def _start_dlhottitles(self, attrsD): |
|
933 |
+ self.version = 'hotrss' |
|
934 |
+ |
|
935 |
+ def _start_channel(self, attrsD): |
|
936 |
+ self.infeed = 1 |
|
937 |
+ self._cdf_common(attrsD) |
|
938 |
+ _start_feedinfo = _start_channel |
|
939 |
+ |
|
940 |
+ def _cdf_common(self, attrsD): |
|
941 |
+ if attrsD.has_key('lastmod'): |
|
942 |
+ self._start_modified({}) |
|
943 |
+ self.elementstack[-1][-1] = attrsD['lastmod'] |
|
944 |
+ self._end_modified() |
|
945 |
+ if attrsD.has_key('href'): |
|
946 |
+ self._start_link({}) |
|
947 |
+ self.elementstack[-1][-1] = attrsD['href'] |
|
948 |
+ self._end_link() |
|
949 |
+ |
|
950 |
+ def _start_feed(self, attrsD): |
|
951 |
+ self.infeed = 1 |
|
952 |
+ versionmap = {'0.1': 'atom01', |
|
953 |
+ '0.2': 'atom02', |
|
954 |
+ '0.3': 'atom03'} |
|
955 |
+ if not self.version: |
|
956 |
+ attr_version = attrsD.get('version') |
|
957 |
+ version = versionmap.get(attr_version) |
|
958 |
+ if version: |
|
959 |
+ self.version = version |
|
960 |
+ else: |
|
961 |
+ self.version = 'atom' |
|
962 |
+ |
|
963 |
+ def _end_channel(self): |
|
964 |
+ self.infeed = 0 |
|
965 |
+ _end_feed = _end_channel |
|
966 |
+ |
|
967 |
+ def _start_image(self, attrsD): |
|
968 |
+ context = self._getContext() |
|
969 |
+ context.setdefault('image', FeedParserDict()) |
|
970 |
+ self.inimage = 1 |
|
971 |
+ self.hasTitle = 0 |
|
972 |
+ self.push('image', 0) |
|
973 |
+ |
|
974 |
+ def _end_image(self): |
|
975 |
+ self.pop('image') |
|
976 |
+ self.inimage = 0 |
|
977 |
+ |
|
978 |
+ def _start_textinput(self, attrsD): |
|
979 |
+ context = self._getContext() |
|
980 |
+ context.setdefault('textinput', FeedParserDict()) |
|
981 |
+ self.intextinput = 1 |
|
982 |
+ self.hasTitle = 0 |
|
983 |
+ self.push('textinput', 0) |
|
984 |
+ _start_textInput = _start_textinput |
|
985 |
+ |
|
986 |
+ def _end_textinput(self): |
|
987 |
+ self.pop('textinput') |
|
988 |
+ self.intextinput = 0 |
|
989 |
+ _end_textInput = _end_textinput |
|
990 |
+ |
|
991 |
+ def _start_author(self, attrsD): |
|
992 |
+ self.inauthor = 1 |
|
993 |
+ self.push('author', 1) |
|
994 |
+ _start_managingeditor = _start_author |
|
995 |
+ _start_dc_author = _start_author |
|
996 |
+ _start_dc_creator = _start_author |
|
997 |
+ _start_itunes_author = _start_author |
|
998 |
+ |
|
999 |
+ def _end_author(self): |
|
1000 |
+ self.pop('author') |
|
1001 |
+ self.inauthor = 0 |
|
1002 |
+ self._sync_author_detail() |
|
1003 |
+ _end_managingeditor = _end_author |
|
1004 |
+ _end_dc_author = _end_author |
|
1005 |
+ _end_dc_creator = _end_author |
|
1006 |
+ _end_itunes_author = _end_author |
|
1007 |
+ |
|
1008 |
+ def _start_itunes_owner(self, attrsD): |
|
1009 |
+ self.inpublisher = 1 |
|
1010 |
+ self.push('publisher', 0) |
|
1011 |
+ |
|
1012 |
+ def _end_itunes_owner(self): |
|
1013 |
+ self.pop('publisher') |
|
1014 |
+ self.inpublisher = 0 |
|
1015 |
+ self._sync_author_detail('publisher') |
|
1016 |
+ |
|
1017 |
+ def _start_contributor(self, attrsD): |
|
1018 |
+ self.incontributor = 1 |
|
1019 |
+ context = self._getContext() |
|
1020 |
+ context.setdefault('contributors', []) |
|
1021 |
+ context['contributors'].append(FeedParserDict()) |
|
1022 |
+ self.push('contributor', 0) |
|
1023 |
+ |
|
1024 |
+ def _end_contributor(self): |
|
1025 |
+ self.pop('contributor') |
|
1026 |
+ self.incontributor = 0 |
|
1027 |
+ |
|
1028 |
+ def _start_dc_contributor(self, attrsD): |
|
1029 |
+ self.incontributor = 1 |
|
1030 |
+ context = self._getContext() |
|
1031 |
+ context.setdefault('contributors', []) |
|
1032 |
+ context['contributors'].append(FeedParserDict()) |
|
1033 |
+ self.push('name', 0) |
|
1034 |
+ |
|
1035 |
+ def _end_dc_contributor(self): |
|
1036 |
+ self._end_name() |
|
1037 |
+ self.incontributor = 0 |
|
1038 |
+ |
|
1039 |
+ def _start_name(self, attrsD): |
|
1040 |
+ self.push('name', 0) |
|
1041 |
+ _start_itunes_name = _start_name |
|
1042 |
+ |
|
1043 |
+ def _end_name(self): |
|
1044 |
+ value = self.pop('name') |
|
1045 |
+ if self.inpublisher: |
|
1046 |
+ self._save_author('name', value, 'publisher') |
|
1047 |
+ elif self.inauthor: |
|
1048 |
+ self._save_author('name', value) |
|
1049 |
+ elif self.incontributor: |
|
1050 |
+ self._save_contributor('name', value) |
|
1051 |
+ elif self.intextinput: |
|
1052 |
+ context = self._getContext() |
|
1053 |
+ context['name'] = value |
|
1054 |
+ _end_itunes_name = _end_name |
|
1055 |
+ |
|
1056 |
+ def _start_width(self, attrsD): |
|
1057 |
+ self.push('width', 0) |
|
1058 |
+ |
|
1059 |
+ def _end_width(self): |
|
1060 |
+ value = self.pop('width') |
|
1061 |
+ try: |
|
1062 |
+ value = int(value) |
|
1063 |
+ except: |
|
1064 |
+ value = 0 |
|
1065 |
+ if self.inimage: |
|
1066 |
+ context = self._getContext() |
|
1067 |
+ context['width'] = value |
|
1068 |
+ |
|
1069 |
+ def _start_height(self, attrsD): |
|
1070 |
+ self.push('height', 0) |
|
1071 |
+ |
|
1072 |
+ def _end_height(self): |
|
1073 |
+ value = self.pop('height') |
|
1074 |
+ try: |
|
1075 |
+ value = int(value) |
|
1076 |
+ except: |
|
1077 |
+ value = 0 |
|
1078 |
+ if self.inimage: |
|
1079 |
+ context = self._getContext() |
|
1080 |
+ context['height'] = value |
|
1081 |
+ |
|
1082 |
+ def _start_url(self, attrsD): |
|
1083 |
+ self.push('href', 1) |
|
1084 |
+ _start_homepage = _start_url |
|
1085 |
+ _start_uri = _start_url |
|
1086 |
+ |
|
1087 |
+ def _end_url(self): |
|
1088 |
+ value = self.pop('href') |
|
1089 |
+ if self.inauthor: |
|
1090 |
+ self._save_author('href', value) |
|
1091 |
+ elif self.incontributor: |
|
1092 |
+ self._save_contributor('href', value) |
|
1093 |
+ _end_homepage = _end_url |
|
1094 |
+ _end_uri = _end_url |
|
1095 |
+ |
|
1096 |
+ def _start_email(self, attrsD): |
|
1097 |
+ self.push('email', 0) |
|
1098 |
+ _start_itunes_email = _start_email |
|
1099 |
+ |
|
1100 |
+ def _end_email(self): |
|
1101 |
+ value = self.pop('email') |
|
1102 |
+ if self.inpublisher: |
|
1103 |
+ self._save_author('email', value, 'publisher') |
|
1104 |
+ elif self.inauthor: |
|
1105 |
+ self._save_author('email', value) |
|
1106 |
+ elif self.incontributor: |
|
1107 |
+ self._save_contributor('email', value) |
|
1108 |
+ _end_itunes_email = _end_email |
|
1109 |
+ |
|
1110 |
+ def _getContext(self): |
|
1111 |
+ if self.insource: |
|
1112 |
+ context = self.sourcedata |
|
1113 |
+ elif self.inimage: |
|
1114 |
+ context = self.feeddata['image'] |
|
1115 |
+ elif self.intextinput: |
|
1116 |
+ context = self.feeddata['textinput'] |
|
1117 |
+ elif self.inentry: |
|
1118 |
+ context = self.entries[-1] |
|
1119 |
+ else: |
|
1120 |
+ context = self.feeddata |
|
1121 |
+ return context |
|
1122 |
+ |
|
1123 |
+ def _save_author(self, key, value, prefix='author'): |
|
1124 |
+ context = self._getContext() |
|
1125 |
+ context.setdefault(prefix + '_detail', FeedParserDict()) |
|
1126 |
+ context[prefix + '_detail'][key] = value |
|
1127 |
+ self._sync_author_detail() |
|
1128 |
+ |
|
1129 |
+ def _save_contributor(self, key, value): |
|
1130 |
+ context = self._getContext() |
|
1131 |
+ context.setdefault('contributors', [FeedParserDict()]) |
|
1132 |
+ context['contributors'][-1][key] = value |
|
1133 |
+ |
|
1134 |
+ def _sync_author_detail(self, key='author'): |
|
1135 |
+ context = self._getContext() |
|
1136 |
+ detail = context.get('%s_detail' % key) |
|
1137 |
+ if detail: |
|
1138 |
+ name = detail.get('name') |
|
1139 |
+ email = detail.get('email') |
|
1140 |
+ if name and email: |
|
1141 |
+ context[key] = '%s (%s)' % (name, email) |
|
1142 |
+ elif name: |
|
1143 |
+ context[key] = name |
|
1144 |
+ elif email: |
|
1145 |
+ context[key] = email |
|
1146 |
+ else: |
|
1147 |
+ author, email = context.get(key), None |
|
1148 |
+ if not author: return |
|
1149 |
+ emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author) |
|
1150 |
+ if emailmatch: |
|
1151 |
+ email = emailmatch.group(0) |
|
1152 |
+ # probably a better way to do the following, but it passes all the tests |
|
1153 |
+ author = author.replace(email, '') |
|
1154 |
+ author = author.replace('()', '') |
|
1155 |
+ author = author.replace('<>', '') |
|
1156 |
+ author = author.replace('<>', '') |
|
1157 |
+ author = author.strip() |
|
1158 |
+ if author and (author[0] == '('): |
|
1159 |
+ author = author[1:] |
|
1160 |
+ if author and (author[-1] == ')'): |
|
1161 |
+ author = author[:-1] |
|
1162 |
+ author = author.strip() |
|
1163 |
+ if author or email: |
|
1164 |
+ context.setdefault('%s_detail' % key, FeedParserDict()) |
|
1165 |
+ if author: |
|
1166 |
+ context['%s_detail' % key]['name'] = author |
|
1167 |
+ if email: |
|
1168 |
+ context['%s_detail' % key]['email'] = email |
|
1169 |
+ |
|
1170 |
+ def _start_subtitle(self, attrsD): |
|
1171 |
+ self.pushContent('subtitle', attrsD, 'text/plain', 1) |
|
1172 |
+ _start_tagline = _start_subtitle |
|
1173 |
+ _start_itunes_subtitle = _start_subtitle |
|
1174 |
+ |
|
1175 |
+ def _end_subtitle(self): |
|
1176 |
+ self.popContent('subtitle') |
|
1177 |
+ _end_tagline = _end_subtitle |
|
1178 |
+ _end_itunes_subtitle = _end_subtitle |
|
1179 |
+ |
|
1180 |
+ def _start_rights(self, attrsD): |
|
1181 |
+ self.pushContent('rights', attrsD, 'text/plain', 1) |
|
1182 |
+ _start_dc_rights = _start_rights |
|
1183 |
+ _start_copyright = _start_rights |
|
1184 |
+ |
|
1185 |
+ def _end_rights(self): |
|
1186 |
+ self.popContent('rights') |
|
1187 |
+ _end_dc_rights = _end_rights |
|
1188 |
+ _end_copyright = _end_rights |
|
1189 |
+ |
|
1190 |
+ def _start_item(self, attrsD): |
|
1191 |
+ self.entries.append(FeedParserDict()) |
|
1192 |
+ self.push('item', 0) |
|
1193 |
+ self.inentry = 1 |
|
1194 |
+ self.guidislink = 0 |
|
1195 |
+ self.hasTitle = 0 |
|
1196 |
+ id = self._getAttribute(attrsD, 'rdf:about') |
|
1197 |
+ if id: |
|
1198 |
+ context = self._getContext() |
|
1199 |
+ context['id'] = id |
|
1200 |
+ self._cdf_common(attrsD) |
|
1201 |
+ _start_entry = _start_item |
|
1202 |
+ _start_product = _start_item |
|
1203 |
+ |
|
1204 |
+ def _end_item(self): |
|
1205 |
+ self.pop('item') |
|
1206 |
+ self.inentry = 0 |
|
1207 |
+ _end_entry = _end_item |
|
1208 |
+ |
|
1209 |
+ def _start_dc_language(self, attrsD): |
|
1210 |
+ self.push('language', 1) |
|
1211 |
+ _start_language = _start_dc_language |
|
1212 |
+ |
|
1213 |
+ def _end_dc_language(self): |
|
1214 |
+ self.lang = self.pop('language') |
|
1215 |
+ _end_language = _end_dc_language |
|
1216 |
+ |
|
1217 |
+ def _start_dc_publisher(self, attrsD): |
|
1218 |
+ self.push('publisher', 1) |
|
1219 |
+ _start_webmaster = _start_dc_publisher |
|
1220 |
+ |
|
1221 |
+ def _end_dc_publisher(self): |
|
1222 |
+ self.pop('publisher') |
|
1223 |
+ self._sync_author_detail('publisher') |
|
1224 |
+ _end_webmaster = _end_dc_publisher |
|
1225 |
+ |
|
1226 |
+ def _start_published(self, attrsD): |
|
1227 |
+ self.push('published', 1) |
|
1228 |
+ _start_dcterms_issued = _start_published |
|
1229 |
+ _start_issued = _start_published |
|
1230 |
+ |
|
1231 |
+ def _end_published(self): |
|
1232 |
+ value = self.pop('published') |
|
1233 |
+ self._save('published_parsed', _parse_date(value)) |
|
1234 |
+ _end_dcterms_issued = _end_published |
|
1235 |
+ _end_issued = _end_published |
|
1236 |
+ |
|
1237 |
+ def _start_updated(self, attrsD): |
|
1238 |
+ self.push('updated', 1) |
|
1239 |
+ _start_modified = _start_updated |
|
1240 |
+ _start_dcterms_modified = _start_updated |
|
1241 |
+ _start_pubdate = _start_updated |
|
1242 |
+ _start_dc_date = _start_updated |
|
1243 |
+ |
|
1244 |
+ def _end_updated(self): |
|
1245 |
+ value = self.pop('updated') |
|
1246 |
+ parsed_value = _parse_date(value) |
|
1247 |
+ self._save('updated_parsed', parsed_value) |
|
1248 |
+ _end_modified = _end_updated |
|
1249 |
+ _end_dcterms_modified = _end_updated |
|
1250 |
+ _end_pubdate = _end_updated |
|
1251 |
+ _end_dc_date = _end_updated |
|
1252 |
+ |
|
1253 |
+ def _start_created(self, attrsD): |
|
1254 |
+ self.push('created', 1) |
|
1255 |
+ _start_dcterms_created = _start_created |
|
1256 |
+ |
|
1257 |
+ def _end_created(self): |
|
1258 |
+ value = self.pop('created') |
|
1259 |
+ self._save('created_parsed', _parse_date(value)) |
|
1260 |
+ _end_dcterms_created = _end_created |
|
1261 |
+ |
|
1262 |
+ def _start_expirationdate(self, attrsD): |
|
1263 |
+ self.push('expired', 1) |
|
1264 |
+ |
|
1265 |
+ def _end_expirationdate(self): |
|
1266 |
+ self._save('expired_parsed', _parse_date(self.pop('expired'))) |
|
1267 |
+ |
|
1268 |
+ def _start_cc_license(self, attrsD): |
|
1269 |
+ context = self._getContext() |
|
1270 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
1271 |
+ attrsD = FeedParserDict() |
|
1272 |
+ attrsD['rel']='license' |
|
1273 |
+ if value: attrsD['href']=value |
|
1274 |
+ context.setdefault('links', []).append(attrsD) |
|
1275 |
+ |
|
1276 |
+ def _start_creativecommons_license(self, attrsD): |
|
1277 |
+ self.push('license', 1) |
|
1278 |
+ _start_creativeCommons_license = _start_creativecommons_license |
|
1279 |
+ |
|
1280 |
+ def _end_creativecommons_license(self): |
|
1281 |
+ value = self.pop('license') |
|
1282 |
+ context = self._getContext() |
|
1283 |
+ attrsD = FeedParserDict() |
|
1284 |
+ attrsD['rel']='license' |
|
1285 |
+ if value: attrsD['href']=value |
|
1286 |
+ context.setdefault('links', []).append(attrsD) |
|
1287 |
+ del context['license'] |
|
1288 |
+ _end_creativeCommons_license = _end_creativecommons_license |
|
1289 |
+ |
|
1290 |
+ def _addXFN(self, relationships, href, name): |
|
1291 |
+ context = self._getContext() |
|
1292 |
+ xfn = context.setdefault('xfn', []) |
|
1293 |
+ value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) |
|
1294 |
+ if value not in xfn: |
|
1295 |
+ xfn.append(value) |
|
1296 |
+ |
|
1297 |
+ def _addTag(self, term, scheme, label): |
|
1298 |
+ context = self._getContext() |
|
1299 |
+ tags = context.setdefault('tags', []) |
|
1300 |
+ if (not term) and (not scheme) and (not label): return |
|
1301 |
+ value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) |
|
1302 |
+ if value not in tags: |
|
1303 |
+ tags.append(value) |
|
1304 |
+ |
|
1305 |
+ def _start_category(self, attrsD): |
|
1306 |
+ if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) |
|
1307 |
+ term = attrsD.get('term') |
|
1308 |
+ scheme = attrsD.get('scheme', attrsD.get('domain')) |
|
1309 |
+ label = attrsD.get('label') |
|
1310 |
+ self._addTag(term, scheme, label) |
|
1311 |
+ self.push('category', 1) |
|
1312 |
+ _start_dc_subject = _start_category |
|
1313 |
+ _start_keywords = _start_category |
|
1314 |
+ |
|
1315 |
+ def _end_itunes_keywords(self): |
|
1316 |
+ for term in self.pop('itunes_keywords').split(): |
|
1317 |
+ self._addTag(term, 'http://www.itunes.com/', None) |
|
1318 |
+ |
|
1319 |
+ def _start_itunes_category(self, attrsD): |
|
1320 |
+ self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) |
|
1321 |
+ self.push('category', 1) |
|
1322 |
+ |
|
1323 |
+ def _end_category(self): |
|
1324 |
+ value = self.pop('category') |
|
1325 |
+ if not value: return |
|
1326 |
+ context = self._getContext() |
|
1327 |
+ tags = context['tags'] |
|
1328 |
+ if value and len(tags) and not tags[-1]['term']: |
|
1329 |
+ tags[-1]['term'] = value |
|
1330 |
+ else: |
|
1331 |
+ self._addTag(value, None, None) |
|
1332 |
+ _end_dc_subject = _end_category |
|
1333 |
+ _end_keywords = _end_category |
|
1334 |
+ _end_itunes_category = _end_category |
|
1335 |
+ |
|
1336 |
+ def _start_cloud(self, attrsD): |
|
1337 |
+ self._getContext()['cloud'] = FeedParserDict(attrsD) |
|
1338 |
+ |
|
1339 |
+ def _start_link(self, attrsD): |
|
1340 |
+ attrsD.setdefault('rel', 'alternate') |
|
1341 |
+ if attrsD['rel'] == 'self': |
|
1342 |
+ attrsD.setdefault('type', 'application/atom+xml') |
|
1343 |
+ else: |
|
1344 |
+ attrsD.setdefault('type', 'text/html') |
|
1345 |
+ context = self._getContext() |
|
1346 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1347 |
+ if attrsD.has_key('href'): |
|
1348 |
+ attrsD['href'] = self.resolveURI(attrsD['href']) |
|
1349 |
+ if attrsD.get('rel')=='enclosure' and not context.get('id'): |
|
1350 |
+ context['id'] = attrsD.get('href') |
|
1351 |
+ expectingText = self.infeed or self.inentry or self.insource |
|
1352 |
+ context.setdefault('links', []) |
|
1353 |
+ context['links'].append(FeedParserDict(attrsD)) |
|
1354 |
+ if attrsD.has_key('href'): |
|
1355 |
+ expectingText = 0 |
|
1356 |
+ if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): |
|
1357 |
+ context['link'] = attrsD['href'] |
|
1358 |
+ else: |
|
1359 |
+ self.push('link', expectingText) |
|
1360 |
+ _start_producturl = _start_link |
|
1361 |
+ |
|
1362 |
+ def _end_link(self): |
|
1363 |
+ value = self.pop('link') |
|
1364 |
+ context = self._getContext() |
|
1365 |
+ _end_producturl = _end_link |
|
1366 |
+ |
|
1367 |
+ def _start_guid(self, attrsD): |
|
1368 |
+ self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') |
|
1369 |
+ self.push('id', 1) |
|
1370 |
+ |
|
1371 |
+ def _end_guid(self): |
|
1372 |
+ value = self.pop('id') |
|
1373 |
+ self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) |
|
1374 |
+ if self.guidislink: |
|
1375 |
+ # guid acts as link, but only if 'ispermalink' is not present or is 'true', |
|
1376 |
+ # and only if the item doesn't already have a link element |
|
1377 |
+ self._save('link', value) |
|
1378 |
+ |
|
1379 |
+ def _start_title(self, attrsD): |
|
1380 |
+ if self.svgOK: return self.unknown_starttag('title', attrsD.items()) |
|
1381 |
+ self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|
1382 |
+ _start_dc_title = _start_title |
|
1383 |
+ _start_media_title = _start_title |
|
1384 |
+ |
|
1385 |
+ def _end_title(self): |
|
1386 |
+ if self.svgOK: return |
|
1387 |
+ value = self.popContent('title') |
|
1388 |
+ if not value: return |
|
1389 |
+ context = self._getContext() |
|
1390 |
+ self.hasTitle = 1 |
|
1391 |
+ _end_dc_title = _end_title |
|
1392 |
+ |
|
1393 |
+ def _end_media_title(self): |
|
1394 |
+ hasTitle = self.hasTitle |
|
1395 |
+ self._end_title() |
|
1396 |
+ self.hasTitle = hasTitle |
|
1397 |
+ |
|
1398 |
+ def _start_description(self, attrsD): |
|
1399 |
+ context = self._getContext() |
|
1400 |
+ if context.has_key('summary'): |
|
1401 |
+ self._summaryKey = 'content' |
|
1402 |
+ self._start_content(attrsD) |
|
1403 |
+ else: |
|
1404 |
+ self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) |
|
1405 |
+ _start_dc_description = _start_description |
|
1406 |
+ |
|
1407 |
+ def _start_abstract(self, attrsD): |
|
1408 |
+ self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|
1409 |
+ |
|
1410 |
+ def _end_description(self): |
|
1411 |
+ if self._summaryKey == 'content': |
|
1412 |
+ self._end_content() |
|
1413 |
+ else: |
|
1414 |
+ value = self.popContent('description') |
|
1415 |
+ self._summaryKey = None |
|
1416 |
+ _end_abstract = _end_description |
|
1417 |
+ _end_dc_description = _end_description |
|
1418 |
+ |
|
1419 |
+ def _start_info(self, attrsD): |
|
1420 |
+ self.pushContent('info', attrsD, 'text/plain', 1) |
|
1421 |
+ _start_feedburner_browserfriendly = _start_info |
|
1422 |
+ |
|
1423 |
+ def _end_info(self): |
|
1424 |
+ self.popContent('info') |
|
1425 |
+ _end_feedburner_browserfriendly = _end_info |
|
1426 |
+ |
|
1427 |
+ def _start_generator(self, attrsD): |
|
1428 |
+ if attrsD: |
|
1429 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1430 |
+ if attrsD.has_key('href'): |
|
1431 |
+ attrsD['href'] = self.resolveURI(attrsD['href']) |
|
1432 |
+ self._getContext()['generator_detail'] = FeedParserDict(attrsD) |
|
1433 |
+ self.push('generator', 1) |
|
1434 |
+ |
|
1435 |
+ def _end_generator(self): |
|
1436 |
+ value = self.pop('generator') |
|
1437 |
+ context = self._getContext() |
|
1438 |
+ if context.has_key('generator_detail'): |
|
1439 |
+ context['generator_detail']['name'] = value |
|
1440 |
+ |
|
1441 |
+ def _start_admin_generatoragent(self, attrsD): |
|
1442 |
+ self.push('generator', 1) |
|
1443 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
1444 |
+ if value: |
|
1445 |
+ self.elementstack[-1][2].append(value) |
|
1446 |
+ self.pop('generator') |
|
1447 |
+ self._getContext()['generator_detail'] = FeedParserDict({'href': value}) |
|
1448 |
+ |
|
1449 |
+ def _start_admin_errorreportsto(self, attrsD): |
|
1450 |
+ self.push('errorreportsto', 1) |
|
1451 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
1452 |
+ if value: |
|
1453 |
+ self.elementstack[-1][2].append(value) |
|
1454 |
+ self.pop('errorreportsto') |
|
1455 |
+ |
|
1456 |
+ def _start_summary(self, attrsD): |
|
1457 |
+ context = self._getContext() |
|
1458 |
+ if context.has_key('summary'): |
|
1459 |
+ self._summaryKey = 'content' |
|
1460 |
+ self._start_content(attrsD) |
|
1461 |
+ else: |
|
1462 |
+ self._summaryKey = 'summary' |
|
1463 |
+ self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) |
|
1464 |
+ _start_itunes_summary = _start_summary |
|
1465 |
+ |
|
1466 |
+ def _end_summary(self): |
|
1467 |
+ if self._summaryKey == 'content': |
|
1468 |
+ self._end_content() |
|
1469 |
+ else: |
|
1470 |
+ self.popContent(self._summaryKey or 'summary') |
|
1471 |
+ self._summaryKey = None |
|
1472 |
+ _end_itunes_summary = _end_summary |
|
1473 |
+ |
|
1474 |
+ def _start_enclosure(self, attrsD): |
|
1475 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1476 |
+ context = self._getContext() |
|
1477 |
+ attrsD['rel']='enclosure' |
|
1478 |
+ context.setdefault('links', []).append(FeedParserDict(attrsD)) |
|
1479 |
+ href = attrsD.get('href') |
|
1480 |
+ if href and not context.get('id'): |
|
1481 |
+ context['id'] = href |
|
1482 |
+ |
|
1483 |
+ def _start_source(self, attrsD): |
|
1484 |
+ self.insource = 1 |
|
1485 |
+ self.hasTitle = 0 |
|
1486 |
+ |
|
1487 |
+ def _end_source(self): |
|
1488 |
+ self.insource = 0 |
|
1489 |
+ self._getContext()['source'] = copy.deepcopy(self.sourcedata) |
|
1490 |
+ self.sourcedata.clear() |
|
1491 |
+ |
|
1492 |
+ def _start_content(self, attrsD): |
|
1493 |
+ self.pushContent('content', attrsD, 'text/plain', 1) |
|
1494 |
+ src = attrsD.get('src') |
|
1495 |
+ if src: |
|
1496 |
+ self.contentparams['src'] = src |
|
1497 |
+ self.push('content', 1) |
|
1498 |
+ |
|
1499 |
+ def _start_prodlink(self, attrsD): |
|
1500 |
+ self.pushContent('content', attrsD, 'text/html', 1) |
|
1501 |
+ |
|
1502 |
+ def _start_body(self, attrsD): |
|
1503 |
+ self.pushContent('content', attrsD, 'application/xhtml+xml', 1) |
|
1504 |
+ _start_xhtml_body = _start_body |
|
1505 |
+ |
|
1506 |
+ def _start_content_encoded(self, attrsD): |
|
1507 |
+ self.pushContent('content', attrsD, 'text/html', 1) |
|
1508 |
+ _start_fullitem = _start_content_encoded |
|
1509 |
+ |
|
1510 |
+ def _end_content(self): |
|
1511 |
+ copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) |
|
1512 |
+ value = self.popContent('content') |
|
1513 |
+ if copyToDescription: |
|
1514 |
+ self._save('description', value) |
|
1515 |
+ |
|
1516 |
+ _end_body = _end_content |
|
1517 |
+ _end_xhtml_body = _end_content |
|
1518 |
+ _end_content_encoded = _end_content |
|
1519 |
+ _end_fullitem = _end_content |
|
1520 |
+ _end_prodlink = _end_content |
|
1521 |
+ |
|
1522 |
+ def _start_itunes_image(self, attrsD): |
|
1523 |
+ self.push('itunes_image', 0) |
|
1524 |
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) |
|
1525 |
+ _start_itunes_link = _start_itunes_image |
|
1526 |
+ |
|
1527 |
+ def _end_itunes_block(self): |
|
1528 |
+ value = self.pop('itunes_block', 0) |
|
1529 |
+ self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 |
|
1530 |
+ |
|
1531 |
+ def _end_itunes_explicit(self): |
|
1532 |
+ value = self.pop('itunes_explicit', 0) |
|
1533 |
+ self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 |
|
1534 |
+ |
|
1535 |
+if _XML_AVAILABLE: |
|
1536 |
+ class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): |
|
1537 |
+ def __init__(self, baseuri, baselang, encoding): |
|
1538 |
+ if _debug: sys.stderr.write('trying StrictFeedParser\n') |
|
1539 |
+ xml.sax.handler.ContentHandler.__init__(self) |
|
1540 |
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
1541 |
+ self.bozo = 0 |
|
1542 |
+ self.exc = None |
|
1543 |
+ |
|
1544 |
+ def startPrefixMapping(self, prefix, uri): |
|
1545 |
+ self.trackNamespace(prefix, uri) |
|
1546 |
+ |
|
1547 |
+ def startElementNS(self, name, qname, attrs): |
|
1548 |
+ namespace, localname = name |
|
1549 |
+ lowernamespace = str(namespace or '').lower() |
|
1550 |
+ if lowernamespace.find('backend.userland.com/rss') <> -1: |
|
1551 |
+ # match any backend.userland.com namespace |
|
1552 |
+ namespace = 'http://backend.userland.com/rss' |
|
1553 |
+ lowernamespace = namespace |
|
1554 |
+ if qname and qname.find(':') > 0: |
|
1555 |
+ givenprefix = qname.split(':')[0] |
|
1556 |
+ else: |
|
1557 |
+ givenprefix = None |
|
1558 |
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
1559 |
+ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): |
|
1560 |
+ raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix |
|
1561 |
+ localname = str(localname).lower() |
|
1562 |
+ |
|
1563 |
+ # qname implementation is horribly broken in Python 2.1 (it |
|
1564 |
+ # doesn't report any), and slightly broken in Python 2.2 (it |
|
1565 |
+ # doesn't report the xml: namespace). So we match up namespaces |
|
1566 |
+ # with a known list first, and then possibly override them with |
|
1567 |
+ # the qnames the SAX parser gives us (if indeed it gives us any |
|
1568 |
+ # at all). Thanks to MatejC for helping me test this and |
|
1569 |
+ # tirelessly telling me that it didn't work yet. |
|
1570 |
+ attrsD = {} |
|
1571 |
+ if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': |
|
1572 |
+ attrsD['xmlns']=namespace |
|
1573 |
+ if localname=='svg' and namespace=='http://www.w3.org/2000/svg': |
|
1574 |
+ attrsD['xmlns']=namespace |
|
1575 |
+ |
|
1576 |
+ if prefix: |
|
1577 |
+ localname = prefix.lower() + ':' + localname |
|
1578 |
+ elif namespace and not qname: #Expat |
|
1579 |
+ for name,value in self.namespacesInUse.items(): |
|
1580 |
+ if name and value == namespace: |
|
1581 |
+ localname = name + ':' + localname |
|
1582 |
+ break |
|
1583 |
+ if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) |
|
1584 |
+ |
|
1585 |
+ for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): |
|
1586 |
+ lowernamespace = (namespace or '').lower() |
|
1587 |
+ prefix = self._matchnamespaces.get(lowernamespace, '') |
|
1588 |
+ if prefix: |
|
1589 |
+ attrlocalname = prefix + ':' + attrlocalname |
|
1590 |
+ attrsD[str(attrlocalname).lower()] = attrvalue |
|
1591 |
+ for qname in attrs.getQNames(): |
|
1592 |
+ attrsD[str(qname).lower()] = attrs.getValueByQName(qname) |
|
1593 |
+ self.unknown_starttag(localname, attrsD.items()) |
|
1594 |
+ |
|
1595 |
+ def characters(self, text): |
|
1596 |
+ self.handle_data(text) |
|
1597 |
+ |
|
1598 |
+ def endElementNS(self, name, qname): |
|
1599 |
+ namespace, localname = name |
|
1600 |
+ lowernamespace = str(namespace or '').lower() |
|
1601 |
+ if qname and qname.find(':') > 0: |
|
1602 |
+ givenprefix = qname.split(':')[0] |
|
1603 |
+ else: |
|
1604 |
+ givenprefix = '' |
|
1605 |
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
1606 |
+ if prefix: |
|
1607 |
+ localname = prefix + ':' + localname |
|
1608 |
+ elif namespace and not qname: #Expat |
|
1609 |
+ for name,value in self.namespacesInUse.items(): |
|
1610 |
+ if name and value == namespace: |
|
1611 |
+ localname = name + ':' + localname |
|
1612 |
+ break |
|
1613 |
+ localname = str(localname).lower() |
|
1614 |
+ self.unknown_endtag(localname) |
|
1615 |
+ |
|
1616 |
+ def error(self, exc): |
|
1617 |
+ self.bozo = 1 |
|
1618 |
+ self.exc = exc |
|
1619 |
+ |
|
1620 |
+ def fatalError(self, exc): |
|
1621 |
+ self.error(exc) |
|
1622 |
+ raise exc |
|
1623 |
+ |
|
1624 |
+class _BaseHTMLProcessor(sgmllib.SGMLParser): |
|
1625 |
+ special = re.compile('''[<>'"]''') |
|
1626 |
+ bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") |
|
1627 |
+ elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', |
|
1628 |
+ 'img', 'input', 'isindex', 'link', 'meta', 'param'] |
|
1629 |
+ |
|
1630 |
+ def __init__(self, encoding, type): |
|
1631 |
+ self.encoding = encoding |
|
1632 |
+ self.type = type |
|
1633 |
+ if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) |
|
1634 |
+ sgmllib.SGMLParser.__init__(self) |
|
1635 |
+ |
|
1636 |
+ def reset(self): |
|
1637 |
+ self.pieces = [] |
|
1638 |
+ sgmllib.SGMLParser.reset(self) |
|
1639 |
+ |
|
1640 |
+ def _shorttag_replace(self, match): |
|
1641 |
+ tag = match.group(1) |
|
1642 |
+ if tag in self.elements_no_end_tag: |
|
1643 |
+ return '<' + tag + ' />' |
|
1644 |
+ else: |
|
1645 |
+ return '<' + tag + '></' + tag + '>' |
|
1646 |
+ |
|
1647 |
+ def parse_starttag(self,i): |
|
1648 |
+ j=sgmllib.SGMLParser.parse_starttag(self, i) |
|
1649 |
+ if self.type == 'application/xhtml+xml': |
|
1650 |
+ if j>2 and self.rawdata[j-2:j]=='/>': |
|
1651 |
+ self.unknown_endtag(self.lasttag) |
|
1652 |
+ return j |
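+ # note: in XHTML content a tag written as <tag/> gets its end tag emitted here as well, |
+ # so the reconstructed output collected in self.pieces stays balanced |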
|
1653 |
+ |
|
1654 |
+ def feed(self, data): |
|
1655 |
+ data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data) |
|
1656 |
+ #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace |
|
1657 |
+ data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) |
|
1658 |
+ data = data.replace('&#39;', "'") |
|
1659 |
+ data = data.replace('&#34;', '"') |
|
1660 |
+ if self.encoding and type(data) == type(u''): |
|
1661 |
+ data = data.encode(self.encoding) |
|
1662 |
+ sgmllib.SGMLParser.feed(self, data) |
|
1663 |
+ sgmllib.SGMLParser.close(self) |
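+ # note: feed() pre-processes the markup before sgmllib parses it: bare "<!" declarations are |
+ # escaped, self-closing tags are expanded via _shorttag_replace, and &#39;/&#34; are decoded |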
|
1664 |
+ |
|
1665 |
+ def normalize_attrs(self, attrs): |
|
1666 |
+ if not attrs: return attrs |
|
1667 |
+ # utility method to be called by descendants |
|
1668 |
+ attrs = dict([(k.lower(), v) for k, v in attrs]).items() |
|
1669 |
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] |
|
1670 |
+ attrs.sort() |
|
1671 |
+ return attrs |
|
1672 |
+ |
|
1673 |
+ def unknown_starttag(self, tag, attrs): |
|
1674 |
+ # called for each start tag |
|
1675 |
+ # attrs is a list of (attr, value) tuples |
|
1676 |
+ # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')] |
|
1677 |
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag) |
|
1678 |
+ uattrs = [] |
|
1679 |
+ strattrs='' |
|
1680 |
+ if attrs: |
|
1681 |
+ for key, value in attrs: |
|
1682 |
+ value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') |
|
1683 |
+ value = self.bare_ampersand.sub("&amp;", value) |
|
1684 |
+ # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds |
|
1685 |
+ if type(value) != type(u''): |
|
1686 |
+ try: |
|
1687 |
+ value = unicode(value, self.encoding) |
|
1688 |
+ except: |
|
1689 |
+ value = unicode(value, 'iso-8859-1') |
|
1690 |
+ uattrs.append((unicode(key, self.encoding), value)) |
|
1691 |
+ strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]) |
|
1692 |
+ if self.encoding: |
|
1693 |
+ try: |
|
1694 |
+ strattrs=strattrs.encode(self.encoding) |
|
1695 |
+ except: |
|
1696 |
+ pass |
|
1697 |
+ if tag in self.elements_no_end_tag: |
|
1698 |
+ self.pieces.append('<%(tag)s%(strattrs)s />' % locals()) |
|
1699 |
+ else: |
|
1700 |
+ self.pieces.append('<%(tag)s%(strattrs)s>' % locals()) |
|
1701 |
+ |
|
1702 |
+ def unknown_endtag(self, tag): |
|
1703 |
+ # called for each end tag, e.g. for </pre>, tag will be 'pre' |
|
1704 |
+ # Reconstruct the original end tag. |
|
1705 |
+ if tag not in self.elements_no_end_tag: |
|
1706 |
+ self.pieces.append("</%(tag)s>" % locals()) |
|
1707 |
+ |
|
1708 |
+ def handle_charref(self, ref): |
|
1709 |
+ # called for each character reference, e.g. for '&#160;', ref will be '160' |
|
1710 |
+ # Reconstruct the original character reference. |
|
1711 |
+ if ref.startswith('x'): |
|
1712 |
+ value = unichr(int(ref[1:],16)) |
|
1713 |
+ else: |
|
1714 |
+ value = unichr(int(ref)) |
|
1715 |
+ |
|
1716 |
+ if value in _cp1252.keys(): |
|
1717 |
+ self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) |
|
1718 |
+ else: |
|
1719 |
+ self.pieces.append('&#%(ref)s;' % locals()) |
|
1720 |
+ |
|
1721 |
+ def handle_entityref(self, ref): |
|
1722 |
+ # called for each entity reference, e.g. for '&copy;', ref will be 'copy' |
|
1723 |
+ # Reconstruct the original entity reference. |
|
1724 |
+ if name2codepoint.has_key(ref): |
|
1725 |
+ self.pieces.append('&%(ref)s;' % locals()) |
|
1726 |
+ else: |
|
1727 |
+ self.pieces.append('&amp;%(ref)s' % locals()) |
|
1728 |
+ |
|
1729 |
+ def handle_data(self, text): |
|
1730 |
+ # called for each block of plain text, i.e. outside of any tag and |
|
1731 |
+ # not containing any character or entity references |
|
1732 |
+ # Store the original text verbatim. |
|
1733 |
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) |
|
1734 |
+ self.pieces.append(text) |
|
1735 |
+ |
|
1736 |
+ def handle_comment(self, text): |
|
1737 |
+ # called for each HTML comment, e.g. <!-- insert Javascript code here --> |
|
1738 |
+ # Reconstruct the original comment. |
|
1739 |
+ self.pieces.append('<!--%(text)s-->' % locals()) |
|
1740 |
+ |
|
1741 |
+ def handle_pi(self, text): |
|
1742 |
+ # called for each processing instruction, e.g. <?instruction> |
|
1743 |
+ # Reconstruct original processing instruction. |
|
1744 |
+ self.pieces.append('<?%(text)s>' % locals()) |
|
1745 |
+ |
|
1746 |
+ def handle_decl(self, text): |
|
1747 |
+ # called for the DOCTYPE, if present, e.g. |
|
1748 |
+ # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" |
|
1749 |
+ # "http://www.w3.org/TR/html4/loose.dtd"> |
|
1750 |
+ # Reconstruct original DOCTYPE |
|
1751 |
+ self.pieces.append('<!%(text)s>' % locals()) |
|
1752 |
+ |
|
1753 |
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match |
|
1754 |
+ def _scan_name(self, i, declstartpos): |
|
1755 |
+ rawdata = self.rawdata |
|
1756 |
+ n = len(rawdata) |
|
1757 |
+ if i == n: |
|
1758 |
+ return None, -1 |
|
1759 |
+ m = self._new_declname_match(rawdata, i) |
|
1760 |
+ if m: |
|
1761 |
+ s = m.group() |
|
1762 |
+ name = s.strip() |
|
1763 |
+ if (i + len(s)) == n: |
|
1764 |
+ return None, -1 # end of buffer |
|
1765 |
+ return name.lower(), m.end() |
|
1766 |
+ else: |
|
1767 |
+ self.handle_data(rawdata) |
|
1768 |
+# self.updatepos(declstartpos, i) |
|
1769 |
+ return None, -1 |
|
1770 |
+ |
|
1771 |
+ def convert_charref(self, name): |
|
1772 |
+ return '&#%s;' % name |
|
1773 |
+ |
|
1774 |
+ def convert_entityref(self, name): |
|
1775 |
+ return '&%s;' % name |
|
1776 |
+ |
|
1777 |
+ def output(self): |
|
1778 |
+ '''Return processed HTML as a single string''' |
|
1779 |
+ return ''.join([str(p) for p in self.pieces]) |
|
1780 |
+ |
|
1781 |
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): |
|
1782 |
+ def __init__(self, baseuri, baselang, encoding, entities): |
|
1783 |
+ sgmllib.SGMLParser.__init__(self) |
|
1784 |
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
1785 |
+ _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml') |
|
1786 |
+ self.entities=entities |
|
1787 |
+ |
|
1788 |
+ def decodeEntities(self, element, data): |
|
1789 |
+ data = data.replace('&#60;', '&lt;') |
|
1790 |
+ data = data.replace('&#x3c;', '&lt;') |
|
1791 |
+ data = data.replace('&#x3C;', '&lt;') |
|
1792 |
+ data = data.replace('&#62;', '&gt;') |
|
1793 |
+ data = data.replace('&#x3e;', '&gt;') |
|
1794 |
+ data = data.replace('&#x3E;', '&gt;') |
|
1795 |
+ data = data.replace('&#38;', '&amp;') |
|
1796 |
+ data = data.replace('&#x26;', '&amp;') |
|
1797 |
+ data = data.replace('&#34;', '&quot;') |
|
1798 |
+ data = data.replace('&#x22;', '&quot;') |
|
1799 |
+ data = data.replace('&#39;', '&apos;') |
|
1800 |
+ data = data.replace('&#x27;', '&apos;') |
|
1801 |
+ if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): |
|
1802 |
+ data = data.replace('&lt;', '<') |
|
1803 |
+ data = data.replace('&gt;', '>') |
|
1804 |
+ data = data.replace('&amp;', '&') |
|
1805 |
+ data = data.replace('&quot;', '"') |
|
1806 |
+ data = data.replace('&apos;', "'") |
|
1807 |
+ return data |
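+ # note: numeric character references are normalized to the named entities above; when the |
+ # content type is not XML, the named entities are then decoded to literal characters |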
|
1808 |
+ |
|
1809 |
+ def strattrs(self, attrs): |
|
1810 |
+ return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs]) |
|
1811 |
+ |
|
1812 |
+class _MicroformatsParser: |
|
1813 |
+ STRING = 1 |
|
1814 |
+ DATE = 2 |
|
1815 |
+ URI = 3 |
|
1816 |
+ NODE = 4 |
|
1817 |
+ EMAIL = 5 |
|
1818 |
+ |
|
1819 |
+ known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'] |
|
1820 |
+ known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'] |
|
1821 |
+ |
|
1822 |
+ def __init__(self, data, baseuri, encoding): |
|
1823 |
+ self.document = BeautifulSoup.BeautifulSoup(data) |
|
1824 |
+ self.baseuri = baseuri |
|
1825 |
+ self.encoding = encoding |
|
1826 |
+ if type(data) == type(u''): |
|
1827 |
+ data = data.encode(encoding) |
|
1828 |
+ self.tags = [] |
|
1829 |
+ self.enclosures = [] |
|
1830 |
+ self.xfn = [] |
|
1831 |
+ self.vcard = None |
|
1832 |
+ |
|
1833 |
+ def vcardEscape(self, s): |
|
1834 |
+ if type(s) in (type(''), type(u'')): |
|
1835 |
+ s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') |
|
1836 |
+ return s |
|
1837 |
+ |
|
1838 |
+ def vcardFold(self, s): |
|
1839 |
+ s = re.sub(';+$', '', s) |
|
1840 |
+ sFolded = '' |
|
1841 |
+ iMax = 75 |
|
1842 |
+ sPrefix = '' |
|
1843 |
+ while len(s) > iMax: |
|
1844 |
+ sFolded += sPrefix + s[:iMax] + '\n' |
|
1845 |
+ s = s[iMax:] |
|
1846 |
+ sPrefix = ' ' |
|
1847 |
+ iMax = 74 |
|
1848 |
+ sFolded += sPrefix + s |
|
1849 |
+ return sFolded |
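+ # note: folds long vCard property lines as RFC 2426 describes: 75 characters on the first |
+ # line, then 74-character continuation lines prefixed with a single space |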
|
1850 |
+ |
|
1851 |
+ def normalize(self, s): |
|
1852 |
+ return re.sub(r'\s+', ' ', s).strip() |
|
1853 |
+ |
|
1854 |
+ def unique(self, aList): |
|
1855 |
+ results = [] |
|
1856 |
+ for element in aList: |
|
1857 |
+ if element not in results: |
|
1858 |
+ results.append(element) |
|
1859 |
+ return results |
|
1860 |
+ |
|
1861 |
+ def toISO8601(self, dt): |
|
1862 |
+ return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) |
|
1863 |
+ |
|
1864 |
+ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0): |
|
1865 |
+ all = lambda x: 1 |
|
1866 |
+ sProperty = sProperty.lower() |
|
1867 |
+ bFound = 0 |
|
1868 |
+ bNormalize = 1 |
|
1869 |
+ propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)} |
|
1870 |
+ if bAllowMultiple and (iPropertyType != self.NODE): |
|
1871 |
+ snapResults = [] |
|
1872 |
+ containers = elmRoot(['ul', 'ol'], propertyMatch) |
|
1873 |
+ for container in containers: |
|
1874 |
+ snapResults.extend(container('li')) |
|
1875 |
+ bFound = (len(snapResults) != 0) |
|
1876 |
+ if not bFound: |
|
1877 |
+ snapResults = elmRoot(all, propertyMatch) |
|
1878 |
+ bFound = (len(snapResults) != 0) |
|
1879 |
+ if (not bFound) and (sProperty == 'value'): |
|
1880 |
+ snapResults = elmRoot('pre') |
|
1881 |
+ bFound = (len(snapResults) != 0) |
|
1882 |
+ bNormalize = not bFound |
|
1883 |
+ if not bFound: |
|
1884 |
+ snapResults = [elmRoot] |
|
1885 |
+ bFound = (len(snapResults) != 0) |
|
1886 |
+ arFilter = [] |
|
1887 |
+ if sProperty == 'vcard': |
|
1888 |
+ snapFilter = elmRoot(all, propertyMatch) |
|
1889 |
+ for node in snapFilter: |
|
1890 |
+ if node.findParent(all, propertyMatch): |
|
1891 |
+ arFilter.append(node) |
|
1892 |
+ arResults = [] |
|
1893 |
+ for node in snapResults: |
|
1894 |
+ if node not in arFilter: |
|
1895 |
+ arResults.append(node) |
|
1896 |
+ bFound = (len(arResults) != 0) |
|
1897 |
+ if not bFound: |
|
1898 |
+ if bAllowMultiple: return [] |
|
1899 |
+ elif iPropertyType == self.STRING: return '' |
|
1900 |
+ elif iPropertyType == self.DATE: return None |
|
1901 |
+ elif iPropertyType == self.URI: return '' |
|
1902 |
+ elif iPropertyType == self.NODE: return None |
|
1903 |
+ else: return None |
|
1904 |
+ arValues = [] |
|
1905 |
+ for elmResult in arResults: |
|
1906 |
+ sValue = None |
|
1907 |
+ if iPropertyType == self.NODE: |
|
1908 |
+ if bAllowMultiple: |
|
1909 |
+ arValues.append(elmResult) |
|
1910 |
+ continue |
|
1911 |
+ else: |
|
1912 |
+ return elmResult |
|
1913 |
+ sNodeName = elmResult.name.lower() |
|
1914 |
+ if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): |
|
1915 |
+ sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0] |
|
1916 |
+ if sValue: |
|
1917 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1918 |
+ if (not sValue) and (sNodeName == 'abbr'): |
|
1919 |
+ sValue = elmResult.get('title') |
|
1920 |
+ if sValue: |
|
1921 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1922 |
+ if (not sValue) and (iPropertyType == self.URI): |
|
1923 |
+ if sNodeName == 'a': sValue = elmResult.get('href') |
|
1924 |
+ elif sNodeName == 'img': sValue = elmResult.get('src') |
|
1925 |
+ elif sNodeName == 'object': sValue = elmResult.get('data') |
|
1926 |
+ if sValue: |
|
1927 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1928 |
+ if (not sValue) and (sNodeName == 'img'): |
|
1929 |
+ sValue = elmResult.get('alt') |
|
1930 |
+ if sValue: |
|
1931 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1932 |
+ if not sValue: |
|
1933 |
+ sValue = elmResult.renderContents() |
|
1934 |
+ sValue = re.sub(r'<\S[^>]*>', '', sValue) |
|
1935 |
+ sValue = sValue.replace('\r\n', '\n') |
|
1936 |
+ sValue = sValue.replace('\r', '\n') |
|
1937 |
+ if sValue: |
|
1938 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1939 |
+ if not sValue: continue |
|
1940 |
+ if iPropertyType == self.DATE: |
|
1941 |
+ sValue = _parse_date_iso8601(sValue) |
|
1942 |
+ if bAllowMultiple: |
|
1943 |
+ arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) |
|
1944 |
+ else: |
|
1945 |
+ return bAutoEscape and self.vcardEscape(sValue) or sValue |
|
1946 |
+ return arValues |
|
1947 |
+ |
|
1948 |
+ def findVCards(self, elmRoot, bAgentParsing=0): |
|
1949 |
+ sVCards = '' |
|
1950 |
+ |
|
1951 |
+ if not bAgentParsing: |
|
1952 |
+ arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) |
|
1953 |
+ else: |
|
1954 |
+ arCards = [elmRoot] |
|
1955 |
+ |
|
1956 |
+ for elmCard in arCards: |
|
1957 |
+ arLines = [] |
|
1958 |
+ |
|
1959 |
+ def processSingleString(sProperty): |
|
1960 |
+ sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1) |
|
1961 |
+ if sValue: |
|
1962 |
+ arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) |
|
1963 |
+ return sValue or '' |
|
1964 |
+ |
|
1965 |
+ def processSingleURI(sProperty): |
|
1966 |
+ sValue = self.getPropertyValue(elmCard, sProperty, self.URI) |
|
1967 |
+ if sValue: |
|
1968 |
+ sContentType = '' |
|
1969 |
+ sEncoding = '' |
|
1970 |
+ sValueKey = '' |
|
1971 |
+ if sValue.startswith('data:'): |
|
1972 |
+ sEncoding = ';ENCODING=b' |
|
1973 |
+ sContentType = sValue.split(';')[0].split('/').pop() |
|
1974 |
+ sValue = sValue.split(',', 1).pop() |
|
1975 |
+ else: |
|
1976 |
+ elmValue = self.getPropertyValue(elmCard, sProperty) |
|
1977 |
+ if elmValue: |
|
1978 |
+ if sProperty != 'url': |
|
1979 |
+ sValueKey = ';VALUE=uri' |
|
1980 |
+ sContentType = elmValue.get('type', '').strip().split('/').pop().strip() |
|
1981 |
+ sContentType = sContentType.upper() |
|
1982 |
+ if sContentType == 'OCTET-STREAM': |
|
1983 |
+ sContentType = '' |
|
1984 |
+ if sContentType: |
|
1985 |
+ sContentType = ';TYPE=' + sContentType.upper() |
|
1986 |
+ arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) |
|
1987 |
+ |
|
1988 |
+ def processTypeValue(sProperty, arDefaultType, arForceType=None): |
|
1989 |
+ arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) |
|
1990 |
+ for elmResult in arResults: |
|
1991 |
+ arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) |
|
1992 |
+ if arForceType: |
|
1993 |
+ arType = self.unique(arForceType + arType) |
|
1994 |
+ if not arType: |
|
1995 |
+ arType = arDefaultType |
|
1996 |
+ sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) |
|
1997 |
+ if sValue: |
|
1998 |
+ arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) |
|
1999 |
+ |
|
2000 |
+ # AGENT |
|
2001 |
+ # must do this before all other properties because it is destructive |
|
2002 |
+ # (removes nested class="vcard" nodes so they don't interfere with |
|
2003 |
+ # this vcard's other properties) |
|
2004 |
+ arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) |
|
2005 |
+ for elmAgent in arAgent: |
|
2006 |
+ if re.compile(r'\bvcard\b').search(elmAgent.get('class')): |
|
2007 |
+ sAgentValue = self.findVCards(elmAgent, 1) + '\n' |
|
2008 |
+ sAgentValue = sAgentValue.replace('\n', '\\n') |
|
2009 |
+ sAgentValue = sAgentValue.replace(';', '\\;') |
|
2010 |
+ if sAgentValue: |
|
2011 |
+ arLines.append(self.vcardFold('AGENT:' + sAgentValue)) |
|
2012 |
+ elmAgent['class'] = '' |
|
2013 |
+ elmAgent.contents = [] |
|
2014 |
+ else: |
|
2015 |
+ sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); |
|
2016 |
+ if sAgentValue: |
|
2017 |
+ arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) |
|
2018 |
+ |
|
2019 |
+ # FN (full name) |
|
2020 |
+ sFN = processSingleString('fn') |
|
2021 |
+ |
|
2022 |
+ # N (name) |
|
2023 |
+ elmName = self.getPropertyValue(elmCard, 'n') |
|
2024 |
+ if elmName: |
|
2025 |
+ sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) |
|
2026 |
+ sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) |
|
2027 |
+ arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) |
|
2028 |
+ arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) |
|
2029 |
+ arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) |
|
2030 |
+ arLines.append(self.vcardFold('N:' + sFamilyName + ';' + |
|
2031 |
+ sGivenName + ';' + |
|
2032 |
+ ','.join(arAdditionalNames) + ';' + |
|
2033 |
+ ','.join(arHonorificPrefixes) + ';' + |
|
2034 |
+ ','.join(arHonorificSuffixes))) |
|
2035 |
+ elif sFN: |
|
2036 |
+ # implied "N" optimization |
|
2037 |
+ # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization |
|
2038 |
+ arNames = self.normalize(sFN).split() |
|
2039 |
+ if len(arNames) == 2: |
|
2040 |
+ bFamilyNameFirst = (arNames[0].endswith(',') or |
|
2041 |
+ len(arNames[1]) == 1 or |
|
2042 |
+ ((len(arNames[1]) == 2) and (arNames[1].endswith('.')))) |
|
2043 |
+ if bFamilyNameFirst: |
|
2044 |
+ arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) |
|
2045 |
+ else: |
|
2046 |
+ arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) |
|
2047 |
+ |
|
2048 |
+ # SORT-STRING |
|
2049 |
+ sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) |
|
2050 |
+ if sSortString: |
|
2051 |
+ arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) |
|
2052 |
+ |
|
2053 |
+ # NICKNAME |
|
2054 |
+ arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) |
|
2055 |
+ if arNickname: |
|
2056 |
+ arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) |
|
2057 |
+ |
|
2058 |
+ # PHOTO |
|
2059 |
+ processSingleURI('photo') |
|
2060 |
+ |
|
2061 |
+ # BDAY |
|
2062 |
+ dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) |
|
2063 |
+ if dtBday: |
|
2064 |
+ arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) |
|
2065 |
+ |
|
2066 |
+ # ADR (address) |
|
2067 |
+ arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) |
|
2068 |
+ for elmAdr in arAdr: |
|
2069 |
+ arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) |
|
2070 |
+ if not arType: |
|
2071 |
+ arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 |
|
2072 |
+ sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) |
|
2073 |
+ sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) |
|
2074 |
+ sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) |
|
2075 |
+ sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) |
|
2076 |
+ sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) |
|
2077 |
+ sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1) |
|
2078 |
+ sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) |
|
2079 |
+ arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' + |
|
2080 |
+ sPostOfficeBox + ';' + |
|
2081 |
+ sExtendedAddress + ';' + |
|
2082 |
+ sStreetAddress + ';' + |
|
2083 |
+ sLocality + ';' + |
|
2084 |
+ sRegion + ';' + |
|
2085 |
+ sPostalCode + ';' + |
|
2086 |
+ sCountryName)) |
|
2087 |
+ |
|
2088 |
+ # LABEL |
|
2089 |
+ processTypeValue('label', ['intl','postal','parcel','work']) |
|
2090 |
+ |
|
2091 |
+ # TEL (phone number) |
|
2092 |
+ processTypeValue('tel', ['voice']) |
|
2093 |
+ |
|
2094 |
+ # EMAIL |
|
2095 |
+ processTypeValue('email', ['internet'], ['internet']) |
|
2096 |
+ |
|
2097 |
+ # MAILER |
|
2098 |
+ processSingleString('mailer') |
|
2099 |
+ |
|
2100 |
+ # TZ (timezone) |
|
2101 |
+ processSingleString('tz') |
|
2102 |
+ |
|
2103 |
+ # GEO (geographical information) |
|
2104 |
+ elmGeo = self.getPropertyValue(elmCard, 'geo') |
|
2105 |
+ if elmGeo: |
|
2106 |
+ sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) |
|
2107 |
+ sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) |
|
2108 |
+ arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) |
|
2109 |
+ |
|
2110 |
+ # TITLE |
|
2111 |
+ processSingleString('title') |
|
2112 |
+ |
|
2113 |
+ # ROLE |
|
2114 |
+ processSingleString('role') |
|
2115 |
+ |
|
2116 |
+ # LOGO |
|
2117 |
+ processSingleURI('logo') |
|
2118 |
+ |
|
2119 |
+ # ORG (organization) |
|
2120 |
+ elmOrg = self.getPropertyValue(elmCard, 'org') |
|
2121 |
+ if elmOrg: |
|
2122 |
+ sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) |
|
2123 |
+ if not sOrganizationName: |
|
2124 |
+ # implied "organization-name" optimization |
|
2125 |
+ # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization |
|
2126 |
+ sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) |
|
2127 |
+ if sOrganizationName: |
|
2128 |
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName)) |
|
2129 |
+ else: |
|
2130 |
+ arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) |
|
2131 |
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) |
|
2132 |
+ |
|
2133 |
+ # CATEGORY |
|
2134 |
+ arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) |
|
2135 |
+ if arCategory: |
|
2136 |
+ arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) |
|
2137 |
+ |
|
2138 |
+ # NOTE |
|
2139 |
+ processSingleString('note') |
|
2140 |
+ |
|
2141 |
+ # REV |
|
2142 |
+ processSingleString('rev') |
|
2143 |
+ |
|
2144 |
+ # SOUND |
|
2145 |
+ processSingleURI('sound') |
|
2146 |
+ |
|
2147 |
+ # UID |
|
2148 |
+ processSingleString('uid') |
|
2149 |
+ |
|
2150 |
+ # URL |
|
2151 |
+ processSingleURI('url') |
|
2152 |
+ |
|
2153 |
+ # CLASS |
|
2154 |
+ processSingleString('class') |
|
2155 |
+ |
|
2156 |
+ # KEY |
|
2157 |
+ processSingleURI('key') |
|
2158 |
+ |
|
2159 |
+ if arLines: |
|
2160 |
+ arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard'] |
|
2161 |
+ sVCards += '\n'.join(arLines) + '\n' |
|
2162 |
+ |
|
2163 |
+ return sVCards.strip() |
|
2164 |
+ |
|
2165 |
+ def isProbablyDownloadable(self, elm): |
|
2166 |
+ attrsD = elm.attrMap |
|
2167 |
+ if not attrsD.has_key('href'): return 0 |
|
2168 |
+ linktype = attrsD.get('type', '').strip() |
|
2169 |
+ if linktype.startswith('audio/') or \ |
|
2170 |
+ linktype.startswith('video/') or \ |
|
2171 |
+ (linktype.startswith('application/') and not linktype.endswith('xml')): |
|
2172 |
+ return 1 |
|
2173 |
+ path = urlparse.urlparse(attrsD['href'])[2] |
|
2174 |
+ if path.find('.') == -1: return 0 |
|
2175 |
+ fileext = path.split('.').pop().lower() |
|
2176 |
+ return fileext in self.known_binary_extensions |
|
2177 |
+ |
|
2178 |
+ def findTags(self): |
|
2179 |
+ all = lambda x: 1 |
|
2180 |
+ for elm in self.document(all, {'rel': re.compile(r'\btag\b')}): |
|
2181 |
+ href = elm.get('href') |
|
2182 |
+ if not href: continue |
|
2183 |
+ urlscheme, domain, path, params, query, fragment = \ |
|
2184 |
+ urlparse.urlparse(_urljoin(self.baseuri, href)) |
|
2185 |
+ segments = path.split('/') |
|
2186 |
+ tag = segments.pop() |
|
2187 |
+ if not tag: |
|
2188 |
+ tag = segments.pop() |
|
2189 |
+ tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', '')) |
|
2190 |
+ if not tagscheme.endswith('/'): |
|
2191 |
+ tagscheme += '/' |
|
2192 |
+ self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''})) |
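+ # note: per the rel-tag microformat, the last non-empty path segment is the tag term and the |
+ # rest of the URL (normalized to end in "/") is used as the scheme |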
|
2193 |
+ |
|
2194 |
+ def findEnclosures(self): |
|
2195 |
+ all = lambda x: 1 |
|
2196 |
+ enclosure_match = re.compile(r'\benclosure\b') |
|
2197 |
+ for elm in self.document(all, {'href': re.compile(r'.+')}): |
|
2198 |
+ if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue |
|
2199 |
+ if elm.attrMap not in self.enclosures: |
|
2200 |
+ self.enclosures.append(elm.attrMap) |
|
2201 |
+ if elm.string and not elm.get('title'): |
|
2202 |
+ self.enclosures[-1]['title'] = elm.string |
|
2203 |
+ |
|
2204 |
+ def findXFN(self): |
|
2205 |
+ all = lambda x: 1 |
|
2206 |
+ for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}): |
|
2207 |
+ rels = elm.get('rel', '').split() |
|
2208 |
+ xfn_rels = [] |
|
2209 |
+ for rel in rels: |
|
2210 |
+ if rel in self.known_xfn_relationships: |
|
2211 |
+ xfn_rels.append(rel) |
|
2212 |
+ if xfn_rels: |
|
2213 |
+ self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string}) |
|
2214 |
+ |
|
2215 |
+def _parseMicroformats(htmlSource, baseURI, encoding): |
|
2216 |
+ if not BeautifulSoup: return |
|
2217 |
+ if _debug: sys.stderr.write('entering _parseMicroformats\n') |
|
2218 |
+ p = _MicroformatsParser(htmlSource, baseURI, encoding) |
|
2219 |
+ p.vcard = p.findVCards(p.document) |
|
2220 |
+ p.findTags() |
|
2221 |
+ p.findEnclosures() |
|
2222 |
+ p.findXFN() |
|
2223 |
+ return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard} |
|
2224 |
+ |
|
2225 |
+class _RelativeURIResolver(_BaseHTMLProcessor): |
|
2226 |
+ relative_uris = [('a', 'href'), |
|
2227 |
+ ('applet', 'codebase'), |
|
2228 |
+ ('area', 'href'), |
|
2229 |
+ ('blockquote', 'cite'), |
|
2230 |
+ ('body', 'background'), |
|
2231 |
+ ('del', 'cite'), |
|
2232 |
+ ('form', 'action'), |
|
2233 |
+ ('frame', 'longdesc'), |
|
2234 |
+ ('frame', 'src'), |
|
2235 |
+ ('iframe', 'longdesc'), |
|
2236 |
+ ('iframe', 'src'), |
|
2237 |
+ ('head', 'profile'), |
|
2238 |
+ ('img', 'longdesc'), |
|
2239 |
+ ('img', 'src'), |
|
2240 |
+ ('img', 'usemap'), |
|
2241 |
+ ('input', 'src'), |
|
2242 |
+ ('input', 'usemap'), |
|
2243 |
+ ('ins', 'cite'), |
|
2244 |
+ ('link', 'href'), |
|
2245 |
+ ('object', 'classid'), |
|
2246 |
+ ('object', 'codebase'), |
|
2247 |
+ ('object', 'data'), |
|
2248 |
+ ('object', 'usemap'), |
|
2249 |
+ ('q', 'cite'), |
|
2250 |
+ ('script', 'src')] |
|
2251 |
+ |
|
2252 |
+ def __init__(self, baseuri, encoding, type): |
|
2253 |
+ _BaseHTMLProcessor.__init__(self, encoding, type) |
|
2254 |
+ self.baseuri = baseuri |
|
2255 |
+ |
|
2256 |
+ def resolveURI(self, uri): |
|
2257 |
+ return _urljoin(self.baseuri, uri.strip()) |
|
2258 |
+ |
|
2259 |
+ def unknown_starttag(self, tag, attrs): |
|
2260 |
+ attrs = self.normalize_attrs(attrs) |
|
2261 |
+ attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] |
|
2262 |
+ _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) |
|
2263 |
+ |
|
2264 |
+def _resolveRelativeURIs(htmlSource, baseURI, encoding, type): |
|
2265 |
+ if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') |
|
2266 |
+ p = _RelativeURIResolver(baseURI, encoding, type) |
|
2267 |
+ p.feed(htmlSource) |
|
2268 |
+ return p.output() |
|
2269 |
+ |
|
2270 |
+class _HTMLSanitizer(_BaseHTMLProcessor): |
|
2271 |
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', |
|
2272 |
+ 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', |
|
2273 |
+ 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', |
|
2274 |
+ 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', |
|
2275 |
+ 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', |
|
2276 |
+ 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', |
|
2277 |
+ 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', |
|
2278 |
+ 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', |
|
2279 |
+ 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', |
|
2280 |
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', |
|
2281 |
+ 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', |
|
2282 |
+ 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] |
|
2283 |
+ |
|
2284 |
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', |
|
2285 |
+ 'action', 'align', 'alt', 'autoplay', 'autocomplete', 'autofocus', 'axis', |
|
2286 |
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border', |
|
2287 |
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', |
|
2288 |
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', |
|
2289 |
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', |
|
2290 |
+ 'colspan', 'compact', 'contenteditable', 'coords', 'data', 'datafld', |
|
2291 |
+ 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', |
|
2292 |
+ 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', |
|
2293 |
+ 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', |
|
2294 |
+ 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', |
|
2295 |
+ 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', |
|
2296 |
+ 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', |
|
2297 |
+ 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', |
|
2298 |
+ 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', |
|
2299 |
+ 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', |
|
2300 |
+ 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', |
|
2301 |
+ 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', |
|
2302 |
+ 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', |
|
2303 |
+ 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', |
|
2304 |
+ 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', |
|
2305 |
+ 'xml:lang'] |
|
2306 |
+ |
|
2307 |
+ unacceptable_elements_with_end_tag = ['script', 'applet', 'style'] |
|
2308 |
+ |
|
2309 |
+ acceptable_css_properties = ['azimuth', 'background-color', |
|
2310 |
+ 'border-bottom-color', 'border-collapse', 'border-color', |
|
2311 |
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear', |
|
2312 |
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', |
|
2313 |
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', |
|
2314 |
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', |
|
2315 |
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', |
|
2316 |
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', |
|
2317 |
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', |
|
2318 |
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', |
|
2319 |
+ 'white-space', 'width'] |
|
2320 |
+ |
|
2321 |
+ # survey of common keywords found in feeds |
|
2322 |
+ acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', |
|
2323 |
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', |
|
2324 |
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', |
|
2325 |
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', |
|
2326 |
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', |
|
2327 |
+ 'transparent', 'underline', 'white', 'yellow'] |
|
2328 |
+ |
|
2329 |
+ valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + |
|
2330 |
+ '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') |
|
2331 |
+ |
|
2332 |
+ mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math', |
|
2333 |
+ 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', |
|
2334 |
+ 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', |
|
2335 |
+ 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', |
|
2336 |
+ 'munderover', 'none', 'semantics'] |
|
2337 |
+ |
|
2338 |
+ mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', |
|
2339 |
+ 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', |
|
2340 |
+ 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', |
|
2341 |
+ 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', |
|
2342 |
+ 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', |
|
2343 |
+ 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', |
|
2344 |
+ 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', |
|
2345 |
+ 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', |
|
2346 |
+ 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'] |
|
2347 |
+ |
|
2348 |
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop |
|
2349 |
+ svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', |
|
2350 |
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', |
|
2351 |
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', |
|
2352 |
+ 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', |
|
2353 |
+ 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', |
|
2354 |
+ 'svg', 'switch', 'text', 'title', 'tspan', 'use'] |
|
2355 |
+ |
|
2356 |
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink |
|
2357 |
+ svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', |
|
2358 |
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType', |
|
2359 |
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', |
|
2360 |
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', |
|
2361 |
+ 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', |
|
2362 |
+ 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', |
|
2363 |
+ 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', |
|
2364 |
+ 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', |
|
2365 |
+ 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', |
|
2366 |
+ 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', |
|
2367 |
+ 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', |
|
2368 |
+ 'min', 'name', 'offset', 'opacity', 'orient', 'origin', |
|
2369 |
+ 'overline-position', 'overline-thickness', 'panose-1', 'path', |
|
2370 |
+ 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', |
|
2371 |
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', |
|
2372 |
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', |
|
2373 |
+ 'stop-color', 'stop-opacity', 'strikethrough-position', |
|
2374 |
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray', |
|
2375 |
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', |
|
2376 |
+ 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', |
|
2377 |
+ 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', |
|
2378 |
+ 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', |
|
2379 |
+ 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', |
|
2380 |
+ 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', |
|
2381 |
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', |
|
2382 |
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', |
|
2383 |
+ 'y2', 'zoomAndPan'] |
|
2384 |
+ |
|
2385 |
+ svg_attr_map = None |
|
2386 |
+ svg_elem_map = None |
|
2387 |
+ |
|
2388 |
+ acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', |
|
2389 |
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', |
|
2390 |
+ 'stroke-opacity'] |
|
2391 |
+ |
|
2392 |
+ def reset(self): |
|
2393 |
+ _BaseHTMLProcessor.reset(self) |
|
2394 |
+ self.unacceptablestack = 0 |
|
2395 |
+ self.mathmlOK = 0 |
|
2396 |
+ self.svgOK = 0 |
|
2397 |
+ |
|
2398 |
+ def unknown_starttag(self, tag, attrs): |
|
2399 |
+ acceptable_attributes = self.acceptable_attributes |
|
2400 |
+ keymap = {} |
|
2401 |
+ if not tag in self.acceptable_elements or self.svgOK: |
|
2402 |
+ if tag in self.unacceptable_elements_with_end_tag: |
|
2403 |
+ self.unacceptablestack += 1 |
|
2404 |
+ |
|
2405 |
+ # not otherwise acceptable, perhaps it is MathML or SVG? |
|
2406 |
+ if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: |
|
2407 |
+ self.mathmlOK += 1 |
|
2408 |
+ if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs: |
|
2409 |
+ self.svgOK += 1 |
|
2410 |
+ |
|
2411 |
+ # chose acceptable attributes based on tag class, else bail |
|
2412 |
+ if self.mathmlOK and tag in self.mathml_elements: |
|
2413 |
+ acceptable_attributes = self.mathml_attributes |
|
2414 |
+ elif self.svgOK and tag in self.svg_elements: |
|
2415 |
+ # for most vocabularies, lowercasing is a good idea. Many |
|
2416 |
+ # svg elements, however, are camel case |
|
2417 |
+ if not self.svg_attr_map: |
|
2418 |
+ lower=[attr.lower() for attr in self.svg_attributes] |
|
2419 |
+ mix=[a for a in self.svg_attributes if a not in lower] |
|
2420 |
+ self.svg_attributes = lower |
|
2421 |
+ self.svg_attr_map = dict([(a.lower(),a) for a in mix]) |
|
2422 |
+ |
|
2423 |
+ lower=[attr.lower() for attr in self.svg_elements] |
|
2424 |
+ mix=[a for a in self.svg_elements if a not in lower] |
|
2425 |
+ self.svg_elements = lower |
|
2426 |
+ self.svg_elem_map = dict([(a.lower(),a) for a in mix]) |
|
2427 |
+ acceptable_attributes = self.svg_attributes |
|
2428 |
+ tag = self.svg_elem_map.get(tag,tag) |
|
2429 |
+ keymap = self.svg_attr_map |
|
2430 |
+ elif not tag in self.acceptable_elements: |
|
2431 |
+ return |
|
2432 |
+ |
|
2433 |
+ # declare xlink namespace, if needed |
|
2434 |
+ if self.mathmlOK or self.svgOK: |
|
2435 |
+ if filter(lambda (n,v): n.startswith('xlink:'),attrs): |
|
2436 |
+ if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: |
|
2437 |
+ attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) |
|
2438 |
+ |
|
2439 |
+ clean_attrs = [] |
|
2440 |
+ for key, value in self.normalize_attrs(attrs): |
|
2441 |
+ if key in acceptable_attributes: |
|
2442 |
+ key=keymap.get(key,key) |
|
2443 |
+ clean_attrs.append((key,value)) |
|
2444 |
+ elif key=='style': |
|
2445 |
+ clean_value = self.sanitize_style(value) |
|
2446 |
+ if clean_value: clean_attrs.append((key,clean_value)) |
|
2447 |
+ _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) |
|
2448 |
+ |
|
2449 |
+ def unknown_endtag(self, tag): |
|
2450 |
+ if not tag in self.acceptable_elements: |
|
2451 |
+ if tag in self.unacceptable_elements_with_end_tag: |
|
2452 |
+ self.unacceptablestack -= 1 |
|
2453 |
+ if self.mathmlOK and tag in self.mathml_elements: |
|
2454 |
+ if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1 |
|
2455 |
+ elif self.svgOK and tag in self.svg_elements: |
|
2456 |
+ tag = self.svg_elem_map.get(tag,tag) |
|
2457 |
+ if tag == 'svg' and self.svgOK: self.svgOK -= 1 |
|
2458 |
+ else: |
|
2459 |
+ return |
|
2460 |
+ _BaseHTMLProcessor.unknown_endtag(self, tag) |
|
2461 |
+ |
|
2462 |
+ def handle_pi(self, text): |
|
2463 |
+ pass |
|
2464 |
+ |
|
2465 |
+ def handle_decl(self, text): |
|
2466 |
+ pass |
|
2467 |
+ |
|
2468 |
+ def handle_data(self, text): |
|
2469 |
+ if not self.unacceptablestack: |
|
2470 |
+ _BaseHTMLProcessor.handle_data(self, text) |
|
2471 |
+ |
|
2472 |
+ def sanitize_style(self, style): |
|
2473 |
+ # disallow urls |
|
2474 |
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) |
|
2475 |
+ |
|
2476 |
+ # gauntlet |
|
2477 |
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' |
|
2478 |
+ if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' |
|
2479 |
+ |
|
2480 |
+ clean = [] |
|
2481 |
+ for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): |
|
2482 |
+ if not value: continue |
|
2483 |
+ if prop.lower() in self.acceptable_css_properties: |
|
2484 |
+ clean.append(prop + ': ' + value + ';') |
|
2485 |
+ elif prop.split('-')[0].lower() in ['background','border','margin','padding']: |
|
2486 |
+ for keyword in value.split(): |
|
2487 |
+ if not keyword in self.acceptable_css_keywords and \ |
|
2488 |
+ not self.valid_css_values.match(keyword): |
|
2489 |
+ break |
|
2490 |
+ else: |
|
2491 |
+ clean.append(prop + ': ' + value + ';') |
|
2492 |
+ elif self.svgOK and prop.lower() in self.acceptable_svg_properties: |
|
2493 |
+ clean.append(prop + ': ' + value + ';') |
|
2494 |
+ |
|
2495 |
+ return ' '.join(clean) |
|
2496 |
+ |
|
2497 |
+ |
|
2498 |
+def _sanitizeHTML(htmlSource, encoding, type): |
|
2499 |
+ p = _HTMLSanitizer(encoding, type) |
|
2500 |
+ p.feed(htmlSource) |
|
2501 |
+ data = p.output() |
|
2502 |
+ if TIDY_MARKUP: |
|
2503 |
+ # loop through list of preferred Tidy interfaces looking for one that's installed, |
|
2504 |
+ # then set up a common _tidy function to wrap the interface-specific API. |
|
2505 |
+ _tidy = None |
|
2506 |
+ for tidy_interface in PREFERRED_TIDY_INTERFACES: |
|
2507 |
+ try: |
|
2508 |
+ if tidy_interface == "uTidy": |
|
2509 |
+ from tidy import parseString as _utidy |
|
2510 |
+ def _tidy(data, **kwargs): |
|
2511 |
+ return str(_utidy(data, **kwargs)) |
|
2512 |
+ break |
|
2513 |
+ elif tidy_interface == "mxTidy": |
|
2514 |
+ from mx.Tidy import Tidy as _mxtidy |
|
2515 |
+ def _tidy(data, **kwargs): |
|
2516 |
+ nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) |
|
2517 |
+ return data |
|
2518 |
+ break |
|
2519 |
+ except: |
|
2520 |
+ pass |
|
2521 |
+ if _tidy: |
|
2522 |
+ utf8 = type(data) == type(u'') |
|
2523 |
+ if utf8: |
|
2524 |
+ data = data.encode('utf-8') |
|
2525 |
+ data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") |
|
2526 |
+ if utf8: |
|
2527 |
+ data = unicode(data, 'utf-8') |
|
2528 |
+ if data.count('<body'): |
|
2529 |
+ data = data.split('<body', 1)[1] |
|
2530 |
+ if data.count('>'): |
|
2531 |
+ data = data.split('>', 1)[1] |
|
2532 |
+ if data.count('</body'): |
|
2533 |
+ data = data.split('</body', 1)[0] |
|
2534 |
+ data = data.strip().replace('\r\n', '\n') |
|
2535 |
+ return data |
|
2536 |
+ |
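+# Illustrative sketch (not part of the original module): _sanitizeHTML drops
+# elements and attributes that are not in the acceptable_* whitelists above,
+# so markup injected into feed content is stripped, roughly like this
+# (assuming TIDY_MARKUP is off, its default):
+#
+#     _sanitizeHTML('<p onclick="evil()">hi<script>evil()</script></p>', 'utf-8', 'text/html')
+#     # -> '<p>hi</p>'  (the onclick attribute and the script element are removed)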
|
2537 |
+class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): |
|
2538 |
+ def http_error_default(self, req, fp, code, msg, headers): |
|
2539 |
+ if ((code / 100) == 3) and (code != 304): |
|
2540 |
+ return self.http_error_302(req, fp, code, msg, headers) |
|
2541 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2542 |
+ infourl.status = code |
|
2543 |
+ return infourl |
|
2544 |
+ |
|
2545 |
+ def http_error_302(self, req, fp, code, msg, headers): |
|
2546 |
+ if headers.dict.has_key('location'): |
|
2547 |
+ infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) |
|
2548 |
+ else: |
|
2549 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2550 |
+ if not hasattr(infourl, 'status'): |
|
2551 |
+ infourl.status = code |
|
2552 |
+ return infourl |
|
2553 |
+ |
|
2554 |
+ def http_error_301(self, req, fp, code, msg, headers): |
|
2555 |
+ if headers.dict.has_key('location'): |
|
2556 |
+ infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) |
|
2557 |
+ else: |
|
2558 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2559 |
+ if not hasattr(infourl, 'status'): |
|
2560 |
+ infourl.status = code |
|
2561 |
+ return infourl |
|
2562 |
+ |
|
2563 |
+ http_error_300 = http_error_302 |
|
2564 |
+ http_error_303 = http_error_302 |
|
2565 |
+ http_error_307 = http_error_302 |
|
2566 |
+ |
|
2567 |
+ def http_error_401(self, req, fp, code, msg, headers): |
|
2568 |
+ # Check if |
|
2569 |
+ # - server requires digest auth, AND |
|
2570 |
+ # - we tried (unsuccessfully) with basic auth, AND |
|
2571 |
+ # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions) |
|
2572 |
+ # If all conditions hold, parse authentication information |
|
2573 |
+ # out of the Authorization header we sent the first time |
|
2574 |
+ # (for the username and password) and the WWW-Authenticate |
|
2575 |
+ # header the server sent back (for the realm) and retry |
|
2576 |
+ # the request with the appropriate digest auth headers instead. |
|
2577 |
+ # This evil genius hack has been brought to you by Aaron Swartz. |
|
2578 |
+ host = urlparse.urlparse(req.get_full_url())[1] |
|
2579 |
+ try: |
|
2580 |
+ assert sys.version.split()[0] >= '2.3.3' |
|
2581 |
+ assert base64 != None |
|
2582 |
+ user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') |
|
2583 |
+ realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] |
|
2584 |
+ self.add_password(realm, host, user, passw) |
|
2585 |
+ retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) |
|
2586 |
+ self.reset_retry_count() |
|
2587 |
+ return retry |
|
2588 |
+ except: |
|
2589 |
+ return self.http_error_default(req, fp, code, msg, headers) |
|
2590 |
+ |
|
2591 |
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): |
|
2592 |
+ """URL, filename, or string --> stream |
|
2593 |
+ |
|
2594 |
+ This function lets you define parsers that take any input source |
|
2595 |
+ (URL, pathname to local or network file, or actual data as a string) |
|
2596 |
+ and deal with it in a uniform manner. Returned object is guaranteed |
|
2597 |
+ to have all the basic stdio read methods (read, readline, readlines). |
|
2598 |
+ Just .close() the object when you're done with it. |
|
2599 |
+ |
|
2600 |
+ If the etag argument is supplied, it will be used as the value of an |
|
2601 |
+ If-None-Match request header. |
|
2602 |
+ |
|
2603 |
+ If the modified argument is supplied, it can be a tuple of 9 integers |
|
2604 |
+ (as returned by gmtime() in the standard Python time module) or a date |
|
2605 |
+ string in any format supported by feedparser. Regardless, it MUST |
|
2606 |
+ be in GMT (Greenwich Mean Time). It will be reformatted into an |
|
2607 |
+ RFC 1123-compliant date and used as the value of an If-Modified-Since |
|
2608 |
+ request header. |
|
2609 |
+ |
|
2610 |
+ If the agent argument is supplied, it will be used as the value of a |
|
2611 |
+ User-Agent request header. |
|
2612 |
+ |
|
2613 |
+ If the referrer argument is supplied, it will be used as the value of a |
|
2614 |
+ Referer[sic] request header. |
|
2615 |
+ |
|
2616 |
+ If handlers is supplied, it is a list of handlers used to build a |
|
2617 |
+ urllib2 opener. |
|
2618 |
+ """ |
|
2619 |
+ |
|
2620 |
+ if hasattr(url_file_stream_or_string, 'read'): |
|
2621 |
+ return url_file_stream_or_string |
|
2622 |
+ |
|
2623 |
+ if url_file_stream_or_string == '-': |
|
2624 |
+ return sys.stdin |
|
2625 |
+ |
|
2626 |
+ if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): |
|
2627 |
+ if not agent: |
|
2628 |
+ agent = USER_AGENT |
|
2629 |
+ # test for inline user:password for basic auth |
|
2630 |
+ auth = None |
|
2631 |
+ if base64: |
|
2632 |
+ urltype, rest = urllib.splittype(url_file_stream_or_string) |
|
2633 |
+ realhost, rest = urllib.splithost(rest) |
|
2634 |
+ if realhost: |
|
2635 |
+ user_passwd, realhost = urllib.splituser(realhost) |
|
2636 |
+ if user_passwd: |
|
2637 |
+ url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) |
|
2638 |
+ auth = base64.encodestring(user_passwd).strip() |
|
2639 |
+ |
|
2640 |
+ # iri support |
|
2641 |
+ try: |
|
2642 |
+ if isinstance(url_file_stream_or_string,unicode): |
|
2643 |
+ url_file_stream_or_string = url_file_stream_or_string.encode('idna') |
|
2644 |
+ else: |
|
2645 |
+ url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna') |
|
2646 |
+ except: |
|
2647 |
+ pass |
|
2648 |
+ |
|
2649 |
+ # try to open with urllib2 (to use optional headers) |
|
2650 |
+ request = urllib2.Request(url_file_stream_or_string) |
|
2651 |
+ request.add_header('User-Agent', agent) |
|
2652 |
+ if etag: |
|
2653 |
+ request.add_header('If-None-Match', etag) |
|
2654 |
+ if type(modified) == type(''): |
|
2655 |
+ modified = _parse_date(modified) |
|
2656 |
+ if modified: |
|
2657 |
+ # format into an RFC 1123-compliant timestamp. We can't use |
|
2658 |
+ # time.strftime() since the %a and %b directives can be affected |
|
2659 |
+ # by the current locale, but RFC 2616 states that dates must be |
|
2660 |
+ # in English. |
|
2661 |
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] |
|
2662 |
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
2663 |
+ request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) |
|
2664 |
+ if referrer: |
|
2665 |
+ request.add_header('Referer', referrer) |
|
2666 |
+ if gzip and zlib: |
|
2667 |
+ request.add_header('Accept-encoding', 'gzip, deflate') |
|
2668 |
+ elif gzip: |
|
2669 |
+ request.add_header('Accept-encoding', 'gzip') |
|
2670 |
+ elif zlib: |
|
2671 |
+ request.add_header('Accept-encoding', 'deflate') |
|
2672 |
+ else: |
|
2673 |
+ request.add_header('Accept-encoding', '') |
|
2674 |
+ if auth: |
|
2675 |
+ request.add_header('Authorization', 'Basic %s' % auth) |
|
2676 |
+ if ACCEPT_HEADER: |
|
2677 |
+ request.add_header('Accept', ACCEPT_HEADER) |
|
2678 |
+ request.add_header('A-IM', 'feed') # RFC 3229 support |
|
2679 |
+ opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) |
|
2680 |
+ opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent |
|
2681 |
+ try: |
|
2682 |
+ return opener.open(request) |
|
2683 |
+ finally: |
|
2684 |
+ opener.close() # JohnD |
|
2685 |
+ |
|
2686 |
+ # try to open with native open function (if url_file_stream_or_string is a filename) |
|
2687 |
+ try: |
|
2688 |
+ return open(url_file_stream_or_string) |
|
2689 |
+ except: |
|
2690 |
+ pass |
|
2691 |
+ |
|
2692 |
+ # treat url_file_stream_or_string as string |
|
2693 |
+ return _StringIO(str(url_file_stream_or_string)) |
|
2694 |
+ |
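+# Illustrative sketch (not part of the original module): _open_resource accepts
+# a URL, a local filename, an open stream, or raw feed data, and returns a
+# readable object in every case. The URL and filename below are hypothetical:
+#
+#     f = _open_resource('http://example.com/feed.xml', None, None, None, None, [])
+#     f = _open_resource('/tmp/feed.xml', None, None, None, None, [])
+#     f = _open_resource('<rss version="2.0"></rss>', None, None, None, None, [])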
|
2695 |
+_date_handlers = [] |
|
2696 |
+def registerDateHandler(func): |
|
2697 |
+ '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' |
|
2698 |
+ _date_handlers.insert(0, func) |
|
2699 |
+ |
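+# Illustrative sketch (not part of the original module): registerDateHandler
+# prepends a callable that takes a date string and returns a 9-tuple in GMT,
+# so extra formats can be supported without touching _parse_date. The handler
+# below is hypothetical:
+#
+#     def _parse_date_unix_epoch(dateString):
+#         '''Parse a bare Unix timestamp such as "1295554233"'''
+#         if not dateString.isdigit(): return None
+#         return time.gmtime(int(dateString))
+#     registerDateHandler(_parse_date_unix_epoch)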
|
2700 |
+# ISO-8601 date parsing routines written by Fazal Majid. |
|
2701 |
+# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 |
|
2702 |
+# parser is beyond the scope of feedparser and would be a worthwhile addition |
|
2703 |
+# to the Python library. |
|
2704 |
+# A single regular expression cannot parse ISO 8601 date formats into groups |
|
2705 |
+# as the standard is highly irregular (for instance, is 030104 2003-01-04 or
|
2706 |
+# 0301-04-01?), so we use templates instead.
|
2707 |
+# Please note the order in templates is significant because we need a |
|
2708 |
+# greedy match. |
|
2709 |
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', |
|
2710 |
+ 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', |
|
2711 |
+ '-YY-?MM', '-OOO', '-YY', |
|
2712 |
+ '--MM-?DD', '--MM', |
|
2713 |
+ '---DD', |
|
2714 |
+ 'CC', ''] |
|
2715 |
+_iso8601_re = [ |
|
2716 |
+ tmpl.replace( |
|
2717 |
+ 'YYYY', r'(?P<year>\d{4})').replace( |
|
2718 |
+ 'YY', r'(?P<year>\d\d)').replace( |
|
2719 |
+ 'MM', r'(?P<month>[01]\d)').replace( |
|
2720 |
+ 'DD', r'(?P<day>[0123]\d)').replace( |
|
2721 |
+ 'OOO', r'(?P<ordinal>[0123]\d\d)').replace( |
|
2722 |
+ 'CC', r'(?P<century>\d\d$)') |
|
2723 |
+ + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})' |
|
2724 |
+ + r'(:(?P<second>\d{2}(\.\d*)?))?' |
|
2725 |
+ + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' |
|
2726 |
+ for tmpl in _iso8601_tmpl] |
|
2727 |
+del tmpl |
|
2728 |
+_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] |
|
2729 |
+del regex |
|
2730 |
+def _parse_date_iso8601(dateString): |
|
2731 |
+ '''Parse a variety of ISO-8601-compatible formats like 20040105''' |
|
2732 |
+ m = None |
|
2733 |
+ for _iso8601_match in _iso8601_matches: |
|
2734 |
+ m = _iso8601_match(dateString) |
|
2735 |
+ if m: break |
|
2736 |
+ if not m: return |
|
2737 |
+ if m.span() == (0, 0): return |
|
2738 |
+ params = m.groupdict() |
|
2739 |
+ ordinal = params.get('ordinal', 0) |
|
2740 |
+ if ordinal: |
|
2741 |
+ ordinal = int(ordinal) |
|
2742 |
+ else: |
|
2743 |
+ ordinal = 0 |
|
2744 |
+ year = params.get('year', '--') |
|
2745 |
+ if not year or year == '--': |
|
2746 |
+ year = time.gmtime()[0] |
|
2747 |
+ elif len(year) == 2: |
|
2748 |
+ # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 |
|
2749 |
+ year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
2750 |
+ else: |
|
2751 |
+ year = int(year) |
|
2752 |
+ month = params.get('month', '-') |
|
2753 |
+ if not month or month == '-': |
|
2754 |
+ # ordinals are NOT normalized by mktime, we simulate them |
|
2755 |
+ # by setting month=1, day=ordinal |
|
2756 |
+ if ordinal: |
|
2757 |
+ month = 1 |
|
2758 |
+ else: |
|
2759 |
+ month = time.gmtime()[1] |
|
2760 |
+ month = int(month) |
|
2761 |
+ day = params.get('day', 0) |
|
2762 |
+ if not day: |
|
2763 |
+ # see above |
|
2764 |
+ if ordinal: |
|
2765 |
+ day = ordinal |
|
2766 |
+ elif params.get('century', 0) or \ |
|
2767 |
+ params.get('year', 0) or params.get('month', 0): |
|
2768 |
+ day = 1 |
|
2769 |
+ else: |
|
2770 |
+ day = time.gmtime()[2] |
|
2771 |
+ else: |
|
2772 |
+ day = int(day) |
|
2773 |
+ # special case of the century - is the first year of the 21st century |
|
2774 |
+ # 2000 or 2001 ? The debate goes on... |
|
2775 |
+ if 'century' in params.keys(): |
|
2776 |
+ year = (int(params['century']) - 1) * 100 + 1 |
|
2777 |
+ # in ISO 8601 most fields are optional |
|
2778 |
+ for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: |
|
2779 |
+ if not params.get(field, None): |
|
2780 |
+ params[field] = 0 |
|
2781 |
+ hour = int(params.get('hour', 0)) |
|
2782 |
+ minute = int(params.get('minute', 0)) |
|
2783 |
+ second = int(float(params.get('second', 0))) |
|
2784 |
+ # weekday is normalized by mktime(), we can ignore it |
|
2785 |
+ weekday = 0 |
|
2786 |
+ daylight_savings_flag = -1 |
|
2787 |
+ tm = [year, month, day, hour, minute, second, weekday, |
|
2788 |
+ ordinal, daylight_savings_flag] |
|
2789 |
+ # ISO 8601 time zone adjustments |
|
2790 |
+ tz = params.get('tz') |
|
2791 |
+ if tz and tz != 'Z': |
|
2792 |
+ if tz[0] == '-': |
|
2793 |
+ tm[3] += int(params.get('tzhour', 0)) |
|
2794 |
+ tm[4] += int(params.get('tzmin', 0)) |
|
2795 |
+ elif tz[0] == '+': |
|
2796 |
+ tm[3] -= int(params.get('tzhour', 0)) |
|
2797 |
+ tm[4] -= int(params.get('tzmin', 0)) |
|
2798 |
+ else: |
|
2799 |
+ return None |
|
2800 |
+ # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) |
|
2801 |
+ # which is guaranteed to normalize d/m/y/h/m/s. |
|
2802 |
+ # Many implementations have bugs, but we'll pretend they don't. |
|
2803 |
+ return time.localtime(time.mktime(tm)) |
|
2804 |
+registerDateHandler(_parse_date_iso8601) |
|
2805 |
+ |
|
2806 |
+# 8-bit date handling routines written by ytrewq1. |
|
2807 |
+_korean_year = u'\ub144' # b3e2 in euc-kr |
|
2808 |
+_korean_month = u'\uc6d4' # bff9 in euc-kr |
|
2809 |
+_korean_day = u'\uc77c' # c0cf in euc-kr |
|
2810 |
+_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr |
|
2811 |
+_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr |
|
2812 |
+ |
|
2813 |
+_korean_onblog_date_re = \ |
|
2814 |
+ re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ |
|
2815 |
+ (_korean_year, _korean_month, _korean_day)) |
|
2816 |
+_korean_nate_date_re = \ |
|
2817 |
+ re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ |
|
2818 |
+ (_korean_am, _korean_pm)) |
|
2819 |
+def _parse_date_onblog(dateString): |
|
2820 |
+ '''Parse a string according to the OnBlog 8-bit date format''' |
|
2821 |
+ m = _korean_onblog_date_re.match(dateString) |
|
2822 |
+ if not m: return |
|
2823 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2824 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2825 |
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
2826 |
+ 'zonediff': '+09:00'} |
|
2827 |
+ if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) |
|
2828 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2829 |
+registerDateHandler(_parse_date_onblog) |
|
2830 |
+ |
|
2831 |
+def _parse_date_nate(dateString): |
|
2832 |
+ '''Parse a string according to the Nate 8-bit date format''' |
|
2833 |
+ m = _korean_nate_date_re.match(dateString) |
|
2834 |
+ if not m: return |
|
2835 |
+ hour = int(m.group(5)) |
|
2836 |
+ ampm = m.group(4) |
|
2837 |
+ if (ampm == _korean_pm): |
|
2838 |
+ hour += 12 |
|
2839 |
+ hour = str(hour) |
|
2840 |
+ if len(hour) == 1: |
|
2841 |
+ hour = '0' + hour |
|
2842 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2843 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2844 |
+ 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ |
|
2845 |
+ 'zonediff': '+09:00'} |
|
2846 |
+ if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) |
|
2847 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2848 |
+registerDateHandler(_parse_date_nate) |
|
2849 |
+ |
|
2850 |
+_mssql_date_re = \ |
|
2851 |
+ re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') |
|
2852 |
+def _parse_date_mssql(dateString): |
|
2853 |
+ '''Parse a string according to the MS SQL date format''' |
|
2854 |
+ m = _mssql_date_re.match(dateString) |
|
2855 |
+ if not m: return |
|
2856 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2857 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2858 |
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
2859 |
+ 'zonediff': '+09:00'} |
|
2860 |
+ if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) |
|
2861 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2862 |
+registerDateHandler(_parse_date_mssql) |
|
2863 |
+ |
|
2864 |
+# Unicode strings for Greek date strings |
|
2865 |
+_greek_months = \ |
|
2866 |
+ { \ |
|
2867 |
+ u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 |
|
2868 |
+ u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 |
|
2869 |
+ u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 |
|
2870 |
+ u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 |
|
2871 |
+ u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 |
|
2872 |
+ u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 |
|
2873 |
+ u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 |
|
2874 |
+ u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 |
|
2875 |
+ u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 |
|
2876 |
+ u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 |
|
2877 |
+ u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 |
|
2878 |
+ u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 |
|
2879 |
+ u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 |
|
2880 |
+ u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 |
|
2881 |
+ u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 |
|
2882 |
+ u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 |
|
2883 |
+ u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 |
|
2884 |
+ u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 |
|
2885 |
+ u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 |
|
2886 |
+ } |
|
2887 |
+ |
|
2888 |
+_greek_wdays = \ |
|
2889 |
+ { \ |
|
2890 |
+ u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 |
|
2891 |
+ u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 |
|
2892 |
+ u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 |
|
2893 |
+ u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 |
|
2894 |
+ u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 |
|
2895 |
+ u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 |
|
2896 |
+ u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 |
|
2897 |
+ } |
|
2898 |
+ |
|
2899 |
+_greek_date_format_re = \ |
|
2900 |
+ re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') |
|
2901 |
+ |
|
2902 |
+def _parse_date_greek(dateString): |
|
2903 |
+ '''Parse a string according to a Greek 8-bit date format.''' |
|
2904 |
+ m = _greek_date_format_re.match(dateString) |
|
2905 |
+ if not m: return |
|
2906 |
+ try: |
|
2907 |
+ wday = _greek_wdays[m.group(1)] |
|
2908 |
+ month = _greek_months[m.group(3)] |
|
2909 |
+ except: |
|
2910 |
+ return |
|
2911 |
+ rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ |
|
2912 |
+ {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ |
|
2913 |
+ 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ |
|
2914 |
+ 'zonediff': m.group(8)} |
|
2915 |
+ if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) |
|
2916 |
+ return _parse_date_rfc822(rfc822date) |
|
2917 |
+registerDateHandler(_parse_date_greek) |
|
2918 |
+ |
|
2919 |
+# Unicode strings for Hungarian date strings |
|
2920 |
+_hungarian_months = \ |
|
2921 |
+ { \ |
|
2922 |
+ u'janu\u00e1r': u'01', # e1 in iso-8859-2 |
|
2923 |
+ u'febru\u00e1ri': u'02', # e1 in iso-8859-2 |
|
2924 |
+ u'm\u00e1rcius': u'03', # e1 in iso-8859-2 |
|
2925 |
+ u'\u00e1prilis': u'04', # e1 in iso-8859-2 |
|
2926 |
+ u'm\u00e1ujus': u'05', # e1 in iso-8859-2 |
|
2927 |
+ u'j\u00fanius': u'06', # fa in iso-8859-2 |
|
2928 |
+ u'j\u00falius': u'07', # fa in iso-8859-2 |
|
2929 |
+ u'augusztus': u'08', |
|
2930 |
+ u'szeptember': u'09', |
|
2931 |
+ u'okt\u00f3ber': u'10', # f3 in iso-8859-2 |
|
2932 |
+ u'november': u'11', |
|
2933 |
+ u'december': u'12', |
|
2934 |
+ } |
|
2935 |
+ |
|
2936 |
+_hungarian_date_format_re = \ |
|
2937 |
+ re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') |
|
2938 |
+ |
|
2939 |
+def _parse_date_hungarian(dateString): |
|
2940 |
+ '''Parse a string according to a Hungarian 8-bit date format.''' |
|
2941 |
+ m = _hungarian_date_format_re.match(dateString) |
|
2942 |
+ if not m: return |
|
2943 |
+ try: |
|
2944 |
+ month = _hungarian_months[m.group(2)] |
|
2945 |
+ day = m.group(3) |
|
2946 |
+ if len(day) == 1: |
|
2947 |
+ day = '0' + day |
|
2948 |
+ hour = m.group(4) |
|
2949 |
+ if len(hour) == 1: |
|
2950 |
+ hour = '0' + hour |
|
2951 |
+ except: |
|
2952 |
+ return |
|
2953 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ |
|
2954 |
+ {'year': m.group(1), 'month': month, 'day': day,\ |
|
2955 |
+ 'hour': hour, 'minute': m.group(5),\ |
|
2956 |
+ 'zonediff': m.group(6)} |
|
2957 |
+ if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) |
|
2958 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2959 |
+registerDateHandler(_parse_date_hungarian) |
|
2960 |
+ |
|
2961 |
+# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by |
|
2962 |
+# Drake and licensed under the Python license. Removed all range checking |
|
2963 |
+# for month, day, hour, minute, and second, since mktime will normalize |
|
2964 |
+# these later |
|
2965 |
+def _parse_date_w3dtf(dateString): |
|
2966 |
+ def __extract_date(m): |
|
2967 |
+ year = int(m.group('year')) |
|
2968 |
+ if year < 100: |
|
2969 |
+ year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
2970 |
+ if year < 1000: |
|
2971 |
+ return 0, 0, 0 |
|
2972 |
+ julian = m.group('julian') |
|
2973 |
+ if julian: |
|
2974 |
+ julian = int(julian) |
|
2975 |
+ month = julian / 30 + 1 |
|
2976 |
+ day = julian % 30 + 1 |
|
2977 |
+ jday = None |
|
2978 |
+ while jday != julian: |
|
2979 |
+ t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) |
|
2980 |
+ jday = time.gmtime(t)[-2] |
|
2981 |
+ diff = abs(jday - julian) |
|
2982 |
+ if jday > julian: |
|
2983 |
+ if diff < day: |
|
2984 |
+ day = day - diff |
|
2985 |
+ else: |
|
2986 |
+ month = month - 1 |
|
2987 |
+ day = 31 |
|
2988 |
+ elif jday < julian: |
|
2989 |
+ if day + diff < 28: |
|
2990 |
+ day = day + diff |
|
2991 |
+ else: |
|
2992 |
+ month = month + 1 |
|
2993 |
+ return year, month, day |
|
2994 |
+ month = m.group('month') |
|
2995 |
+ day = 1 |
|
2996 |
+ if month is None: |
|
2997 |
+ month = 1 |
|
2998 |
+ else: |
|
2999 |
+ month = int(month) |
|
3000 |
+ day = m.group('day') |
|
3001 |
+ if day: |
|
3002 |
+ day = int(day) |
|
3003 |
+ else: |
|
3004 |
+ day = 1 |
|
3005 |
+ return year, month, day |
|
3006 |
+ |
|
3007 |
+ def __extract_time(m): |
|
3008 |
+ if not m: |
|
3009 |
+ return 0, 0, 0 |
|
3010 |
+ hours = m.group('hours') |
|
3011 |
+ if not hours: |
|
3012 |
+ return 0, 0, 0 |
|
3013 |
+ hours = int(hours) |
|
3014 |
+ minutes = int(m.group('minutes')) |
|
3015 |
+ seconds = m.group('seconds') |
|
3016 |
+ if seconds: |
|
3017 |
+ seconds = int(seconds) |
|
3018 |
+ else: |
|
3019 |
+ seconds = 0 |
|
3020 |
+ return hours, minutes, seconds |
|
3021 |
+ |
|
3022 |
+ def __extract_tzd(m): |
|
3023 |
+ '''Return the Time Zone Designator as an offset in seconds from UTC.''' |
|
3024 |
+ if not m: |
|
3025 |
+ return 0 |
|
3026 |
+ tzd = m.group('tzd') |
|
3027 |
+ if not tzd: |
|
3028 |
+ return 0 |
|
3029 |
+ if tzd == 'Z': |
|
3030 |
+ return 0 |
|
3031 |
+ hours = int(m.group('tzdhours')) |
|
3032 |
+ minutes = m.group('tzdminutes') |
|
3033 |
+ if minutes: |
|
3034 |
+ minutes = int(minutes) |
|
3035 |
+ else: |
|
3036 |
+ minutes = 0 |
|
3037 |
+ offset = (hours*60 + minutes) * 60 |
|
3038 |
+ if tzd[0] == '+': |
|
3039 |
+ return -offset |
|
3040 |
+ return offset |
|
3041 |
+ |
|
3042 |
+ __date_re = ('(?P<year>\d\d\d\d)' |
|
3043 |
+ '(?:(?P<dsep>-|)' |
|
3044 |
+ '(?:(?P<julian>\d\d\d)' |
|
3045 |
+ '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?') |
|
3046 |
+ __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)' |
|
3047 |
+ __tzd_rx = re.compile(__tzd_re) |
|
3048 |
+ __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)' |
|
3049 |
+ '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?' |
|
3050 |
+ + __tzd_re) |
|
3051 |
+ __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) |
|
3052 |
+ __datetime_rx = re.compile(__datetime_re) |
|
3053 |
+ m = __datetime_rx.match(dateString) |
|
3054 |
+ if (m is None) or (m.group() != dateString): return |
|
3055 |
+ gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) |
|
3056 |
+ if gmt[0] == 0: return |
|
3057 |
+ return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) |
|
3058 |
+registerDateHandler(_parse_date_w3dtf) |
|
3059 |
+ |
|
3060 |
+def _parse_date_rfc822(dateString): |
|
3061 |
+ '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' |
|
3062 |
+ data = dateString.split() |
|
3063 |
+ if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: |
|
3064 |
+ del data[0] |
|
3065 |
+ if len(data) == 4: |
|
3066 |
+ s = data[3] |
|
3067 |
+ i = s.find('+') |
|
3068 |
+ if i > 0: |
|
3069 |
+ data[3:] = [s[:i], s[i+1:]] |
|
3070 |
+ else: |
|
3071 |
+ data.append('') |
|
3072 |
+ dateString = " ".join(data) |
|
3073 |
+ if len(data) < 5: |
|
3074 |
+ dateString += ' 00:00:00 GMT' |
|
3075 |
+ tm = rfc822.parsedate_tz(dateString) |
|
3076 |
+ if tm: |
|
3077 |
+ return time.gmtime(rfc822.mktime_tz(tm)) |
|
3078 |
+# rfc822.py defines several time zones, but we define some extra ones. |
|
3079 |
+# 'ET' is equivalent to 'EST', etc. |
|
3080 |
+_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} |
|
3081 |
+rfc822._timezones.update(_additional_timezones) |
|
3082 |
+registerDateHandler(_parse_date_rfc822) |
|
3083 |
+ |
|
3084 |
+def _parse_date_perforce(aDateString): |
|
3085 |
+ """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" |
|
3086 |
+ # Fri, 2006/09/15 08:19:53 EDT |
|
3087 |
+ _my_date_pattern = re.compile( \ |
|
3088 |
+ r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') |
|
3089 |
+ |
|
3090 |
+ dow, year, month, day, hour, minute, second, tz = \ |
|
3091 |
+ _my_date_pattern.search(aDateString).groups() |
|
3092 |
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
3093 |
+ dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) |
|
3094 |
+ tm = rfc822.parsedate_tz(dateString) |
|
3095 |
+ if tm: |
|
3096 |
+ return time.gmtime(rfc822.mktime_tz(tm)) |
|
3097 |
+registerDateHandler(_parse_date_perforce) |
|
3098 |
+ |
|
3099 |
+def _parse_date(dateString): |
|
3100 |
+ '''Parses a variety of date formats into a 9-tuple in GMT''' |
|
3101 |
+ for handler in _date_handlers: |
|
3102 |
+ try: |
|
3103 |
+ date9tuple = handler(dateString) |
|
3104 |
+ if not date9tuple: continue |
|
3105 |
+ if len(date9tuple) != 9: |
|
3106 |
+ if _debug: sys.stderr.write('date handler function must return 9-tuple\n') |
|
3107 |
+ raise ValueError |
|
3108 |
+ map(int, date9tuple) |
|
3109 |
+ return date9tuple |
|
3110 |
+ except Exception as e: |
|
3111 |
+ if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) |
|
3112 |
+ pass |
|
3113 |
+ return None |
|
3114 |
+ |
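+# Illustrative sketch (not part of the original module): _parse_date tries each
+# registered handler in turn and returns a 9-tuple in GMT, or None if nothing
+# matched, e.g.:
+#
+#     _parse_date('2004-01-05T12:30:00Z')            # 9-tuple for 2004-01-05 12:30:00 GMT
+#     _parse_date('Mon, 05 Jan 2004 12:30:00 GMT')   # same instant, via the RFC 822 handler
+#     _parse_date('this is not a date')              # None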
|
3115 |
+def _getCharacterEncoding(http_headers, xml_data): |
|
3116 |
+ '''Get the character encoding of the XML document |
|
3117 |
+ |
|
3118 |
+ http_headers is a dictionary |
|
3119 |
+ xml_data is a raw string (not Unicode) |
|
3120 |
+ |
|
3121 |
+ This is so much trickier than it sounds, it's not even funny. |
|
3122 |
+ According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type |
|
3123 |
+ is application/xml, application/*+xml, |
|
3124 |
+ application/xml-external-parsed-entity, or application/xml-dtd, |
|
3125 |
+ the encoding given in the charset parameter of the HTTP Content-Type |
|
3126 |
+ takes precedence over the encoding given in the XML prefix within the |
|
3127 |
+ document, and defaults to 'utf-8' if neither are specified. But, if |
|
3128 |
+ the HTTP Content-Type is text/xml, text/*+xml, or |
|
3129 |
+ text/xml-external-parsed-entity, the encoding given in the XML prefix |
|
3130 |
+ within the document is ALWAYS IGNORED and only the encoding given in |
|
3131 |
+ the charset parameter of the HTTP Content-Type header should be |
|
3132 |
+ respected, and it defaults to 'us-ascii' if not specified. |
|
3133 |
+ |
|
3134 |
+ Furthermore, discussion on the atom-syntax mailing list with the |
|
3135 |
+ author of RFC 3023 leads me to the conclusion that any document |
|
3136 |
+ served with a Content-Type of text/* and no charset parameter |
|
3137 |
+ must be treated as us-ascii. (We now do this.) And also that it |
|
3138 |
+ must always be flagged as non-well-formed. (We now do this too.) |
|
3139 |
+ |
|
3140 |
+ If Content-Type is unspecified (input was local file or non-HTTP source) |
|
3141 |
+ or unrecognized (server just got it totally wrong), then go by the |
|
3142 |
+ encoding given in the XML prefix of the document and default to |
|
3143 |
+ 'iso-8859-1' as per the HTTP specification (RFC 2616). |
|
3144 |
+ |
|
3145 |
+ Then, assuming we didn't find a character encoding in the HTTP headers |
|
3146 |
+ (and the HTTP Content-type allowed us to look in the body), we need |
|
3147 |
+ to sniff the first few bytes of the XML data and try to determine |
|
3148 |
+ whether the encoding is ASCII-compatible. Section F of the XML |
|
3149 |
+ specification shows the way here: |
|
3150 |
+ http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
3151 |
+ |
|
3152 |
+ If the sniffed encoding is not ASCII-compatible, we need to make it |
|
3153 |
+ ASCII compatible so that we can sniff further into the XML declaration |
|
3154 |
+ to find the encoding attribute, which will tell us the true encoding. |
|
3155 |
+ |
|
3156 |
+ Of course, none of this guarantees that we will be able to parse the |
|
3157 |
+ feed in the declared character encoding (assuming it was declared |
|
3158 |
+ correctly, which many are not). CJKCodecs and iconv_codec help a lot; |
|
3159 |
+ you should definitely install them if you can. |
|
3160 |
+ http://cjkpython.i18n.org/ |
|
3161 |
+ ''' |
|
3162 |
+ |
|
3163 |
+ def _parseHTTPContentType(content_type): |
|
3164 |
+ '''takes HTTP Content-Type header and returns (content type, charset) |
|
3165 |
+ |
|
3166 |
+ If no charset is specified, returns (content type, '') |
|
3167 |
+ If no content type is specified, returns ('', '') |
|
3168 |
+ Both return parameters are guaranteed to be lowercase strings |
|
3169 |
+ ''' |
|
3170 |
+ content_type = content_type or '' |
|
3171 |
+ content_type, params = cgi.parse_header(content_type) |
|
3172 |
+ return content_type, params.get('charset', '').replace("'", '') |
|
3173 |
+ |
|
3174 |
+ sniffed_xml_encoding = '' |
|
3175 |
+ xml_encoding = '' |
|
3176 |
+ true_encoding = '' |
|
3177 |
+ http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) |
|
3178 |
+ # Must sniff for non-ASCII-compatible character encodings before |
|
3179 |
+ # searching for XML declaration. This heuristic is defined in |
|
3180 |
+ # section F of the XML specification: |
|
3181 |
+ # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
3182 |
+ try: |
|
3183 |
+ if xml_data[:4] == '\x4c\x6f\xa7\x94': |
|
3184 |
+ # EBCDIC |
|
3185 |
+ xml_data = _ebcdic_to_ascii(xml_data) |
|
3186 |
+ elif xml_data[:4] == '\x00\x3c\x00\x3f': |
|
3187 |
+ # UTF-16BE |
|
3188 |
+ sniffed_xml_encoding = 'utf-16be' |
|
3189 |
+ xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') |
|
3190 |
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): |
|
3191 |
+ # UTF-16BE with BOM |
|
3192 |
+ sniffed_xml_encoding = 'utf-16be' |
|
3193 |
+ xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') |
|
3194 |
+ elif xml_data[:4] == '\x3c\x00\x3f\x00': |
|
3195 |
+ # UTF-16LE |
|
3196 |
+ sniffed_xml_encoding = 'utf-16le' |
|
3197 |
+ xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') |
|
3198 |
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): |
|
3199 |
+ # UTF-16LE with BOM |
|
3200 |
+ sniffed_xml_encoding = 'utf-16le' |
|
3201 |
+ xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') |
|
3202 |
+ elif xml_data[:4] == '\x00\x00\x00\x3c': |
|
3203 |
+ # UTF-32BE |
|
3204 |
+ sniffed_xml_encoding = 'utf-32be' |
|
3205 |
+ xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') |
|
3206 |
+ elif xml_data[:4] == '\x3c\x00\x00\x00': |
|
3207 |
+ # UTF-32LE |
|
3208 |
+ sniffed_xml_encoding = 'utf-32le' |
|
3209 |
+ xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') |
|
3210 |
+ elif xml_data[:4] == '\x00\x00\xfe\xff': |
|
3211 |
+ # UTF-32BE with BOM |
|
3212 |
+ sniffed_xml_encoding = 'utf-32be' |
|
3213 |
+ xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') |
|
3214 |
+ elif xml_data[:4] == '\xff\xfe\x00\x00': |
|
3215 |
+ # UTF-32LE with BOM |
|
3216 |
+ sniffed_xml_encoding = 'utf-32le' |
|
3217 |
+ xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') |
|
3218 |
+ elif xml_data[:3] == '\xef\xbb\xbf': |
|
3219 |
+ # UTF-8 with BOM |
|
3220 |
+ sniffed_xml_encoding = 'utf-8' |
|
3221 |
+ xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') |
|
3222 |
+ else: |
|
3223 |
+ # ASCII-compatible |
|
3224 |
+ pass |
|
3225 |
+ xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) |
|
3226 |
+ except: |
|
3227 |
+ xml_encoding_match = None |
|
3228 |
+ if xml_encoding_match: |
|
3229 |
+ xml_encoding = xml_encoding_match.groups()[0].lower() |
|
3230 |
+ if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): |
|
3231 |
+ xml_encoding = sniffed_xml_encoding |
|
3232 |
+ acceptable_content_type = 0 |
|
3233 |
+ application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') |
|
3234 |
+ text_content_types = ('text/xml', 'text/xml-external-parsed-entity') |
|
3235 |
+ if (http_content_type in application_content_types) or \ |
|
3236 |
+ (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): |
|
3237 |
+ acceptable_content_type = 1 |
|
3238 |
+ true_encoding = http_encoding or xml_encoding or 'utf-8' |
|
3239 |
+ elif (http_content_type in text_content_types) or \ |
|
3240 |
+ (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): |
|
3241 |
+ acceptable_content_type = 1 |
|
3242 |
+ true_encoding = http_encoding or 'us-ascii' |
|
3243 |
+ elif http_content_type.startswith('text/'): |
|
3244 |
+ true_encoding = http_encoding or 'us-ascii' |
|
3245 |
+ elif http_headers and (not http_headers.has_key('content-type')): |
|
3246 |
+ true_encoding = xml_encoding or 'iso-8859-1' |
|
3247 |
+ else: |
|
3248 |
+ true_encoding = xml_encoding or 'utf-8' |
|
3249 |
+ # some feeds claim to be gb2312 but are actually gb18030. |
|
3250 |
+ # apparently MSIE and Firefox both do the following switch: |
|
3251 |
+ if true_encoding.lower() == 'gb2312': |
|
3252 |
+ true_encoding = 'gb18030' |
|
3253 |
+ return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type |
|
3254 |
+ |
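+# Illustrative sketch (not part of the original module): the RFC 3023 rules above
+# mean the same bytes can resolve to different encodings depending on the
+# Content-Type header. The headers below are hypothetical:
+#
+#     data = '<?xml version="1.0" encoding="iso-8859-2"?><feed/>'
+#     _getCharacterEncoding({'content-type': 'application/xml'}, data)[0]
+#     # -> 'iso-8859-2'  (encoding from the XML declaration is honored)
+#     _getCharacterEncoding({'content-type': 'text/xml'}, data)[0]
+#     # -> 'us-ascii'    (XML declaration ignored; HTTP default applies)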
|
3255 |
+def _toUTF8(data, encoding): |
|
3256 |
+ '''Changes an XML data stream on the fly to specify a new encoding |
|
3257 |
+ |
|
3258 |
+ data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already |
|
3259 |
+ encoding is a string recognized by encodings.aliases |
|
3260 |
+ ''' |
|
3261 |
+ if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) |
|
3262 |
+ # strip Byte Order Mark (if present) |
|
3263 |
+ if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): |
|
3264 |
+ if _debug: |
|
3265 |
+ sys.stderr.write('stripping BOM\n') |
|
3266 |
+ if encoding != 'utf-16be': |
|
3267 |
+ sys.stderr.write('trying utf-16be instead\n') |
|
3268 |
+ encoding = 'utf-16be' |
|
3269 |
+ data = data[2:] |
|
3270 |
+ elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): |
|
3271 |
+ if _debug: |
|
3272 |
+ sys.stderr.write('stripping BOM\n') |
|
3273 |
+ if encoding != 'utf-16le': |
|
3274 |
+ sys.stderr.write('trying utf-16le instead\n') |
|
3275 |
+ encoding = 'utf-16le' |
|
3276 |
+ data = data[2:] |
|
3277 |
+ elif data[:3] == '\xef\xbb\xbf': |
|
3278 |
+ if _debug: |
|
3279 |
+ sys.stderr.write('stripping BOM\n') |
|
3280 |
+ if encoding != 'utf-8': |
|
3281 |
+ sys.stderr.write('trying utf-8 instead\n') |
|
3282 |
+ encoding = 'utf-8' |
|
3283 |
+ data = data[3:] |
|
3284 |
+ elif data[:4] == '\x00\x00\xfe\xff': |
|
3285 |
+ if _debug: |
|
3286 |
+ sys.stderr.write('stripping BOM\n') |
|
3287 |
+ if encoding != 'utf-32be': |
|
3288 |
+ sys.stderr.write('trying utf-32be instead\n') |
|
3289 |
+ encoding = 'utf-32be' |
|
3290 |
+ data = data[4:] |
|
3291 |
+ elif data[:4] == '\xff\xfe\x00\x00': |
|
3292 |
+ if _debug: |
|
3293 |
+ sys.stderr.write('stripping BOM\n') |
|
3294 |
+ if encoding != 'utf-32le': |
|
3295 |
+ sys.stderr.write('trying utf-32le instead\n') |
|
3296 |
+ encoding = 'utf-32le' |
|
3297 |
+ data = data[4:] |
|
3298 |
+ newdata = unicode(data, encoding) |
|
3299 |
+ if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) |
|
3300 |
+ declmatch = re.compile('^<\?xml[^>]*?>') |
|
3301 |
+ newdecl = '''<?xml version='1.0' encoding='utf-8'?>''' |
|
3302 |
+ if declmatch.search(newdata): |
|
3303 |
+ newdata = declmatch.sub(newdecl, newdata) |
|
3304 |
+ else: |
|
3305 |
+ newdata = newdecl + u'\n' + newdata |
|
3306 |
+ return newdata.encode('utf-8') |
|
3307 |
+ |
|
3308 |
+def _stripDoctype(data): |
|
3309 |
+ '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) |
|
3310 |
+ |
|
3311 |
+ rss_version may be 'rss091n' or None |
|
3312 |
+ stripped_data is the same XML document, minus the DOCTYPE |
|
3313 |
+ ''' |
|
3314 |
+ start = re.search('<\w',data) |
|
3315 |
+ start = start and start.start() or -1 |
|
3316 |
+ head,data = data[:start+1], data[start+1:] |
|
3317 |
+ |
|
3318 |
+ entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) |
|
3319 |
+ entity_results=entity_pattern.findall(head) |
|
3320 |
+ head = entity_pattern.sub('', head) |
|
3321 |
+ doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) |
|
3322 |
+ doctype_results = doctype_pattern.findall(head) |
|
3323 |
+ doctype = doctype_results and doctype_results[0] or '' |
|
3324 |
+ if doctype.lower().count('netscape'): |
|
3325 |
+ version = 'rss091n' |
|
3326 |
+ else: |
|
3327 |
+ version = None |
|
3328 |
+ |
|
3329 |
+ # only allow 'safe' inline entity definitions
|
3330 |
+ replacement='' |
|
3331 |
+ if len(doctype_results)==1 and entity_results: |
|
3332 |
+ safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"') |
|
3333 |
+ safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) |
|
3334 |
+ if safe_entities: |
|
3335 |
+ replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities) |
|
3336 |
+ data = doctype_pattern.sub(replacement, head) + data |
|
3337 |
+ |
|
3338 |
+ return version, data, dict(replacement and safe_pattern.findall(replacement)) |
|
3339 |
+ |
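+# Illustrative sketch (not part of the original module): on a Netscape RSS 0.91
+# document, _stripDoctype reports the version and returns the document minus
+# its DOCTYPE, e.g.:
+#
+#     doc = '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" ' \
+#           '"http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss version="0.91"></rss>'
+#     version, data, entities = _stripDoctype(doc)
+#     # version == 'rss091n', data starts with '<rss', entities == {}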
|
3340 |
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): |
|
3341 |
+ '''Parse a feed from a URL, file, stream, or string''' |
|
3342 |
+ result = FeedParserDict() |
|
3343 |
+ result['feed'] = FeedParserDict() |
|
3344 |
+ result['entries'] = [] |
|
3345 |
+ if _XML_AVAILABLE: |
|
3346 |
+ result['bozo'] = 0 |
|
3347 |
+ if type(handlers) == types.InstanceType: |
|
3348 |
+ handlers = [handlers] |
|
3349 |
+ try: |
|
3350 |
+ f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) |
|
3351 |
+ data = f.read() |
|
3352 |
+ except Exception as e: |
|
3353 |
+ result['bozo'] = 1 |
|
3354 |
+ result['bozo_exception'] = e |
|
3355 |
+ data = '' |
|
3356 |
+ f = None |
|
3357 |
+ |
|
3358 |
+ # if feed is gzip-compressed, decompress it |
|
3359 |
+ if f and data and hasattr(f, 'headers'): |
|
3360 |
+ if gzip and f.headers.get('content-encoding', '') == 'gzip': |
|
3361 |
+ try: |
|
3362 |
+ data = gzip.GzipFile(fileobj=_StringIO(data)).read() |
|
3363 |
+ except Exception as e: |
|
3364 |
+ # Some feeds claim to be gzipped but they're not, so |
|
3365 |
+ # we get garbage. Ideally, we should re-request the |
|
3366 |
+ # feed without the 'Accept-encoding: gzip' header, |
|
3367 |
+ # but we don't. |
|
3368 |
+ result['bozo'] = 1 |
|
3369 |
+ result['bozo_exception'] = e |
|
3370 |
+ data = '' |
|
3371 |
+ elif zlib and f.headers.get('content-encoding', '') == 'deflate': |
|
3372 |
+ try: |
|
3373 |
+ data = zlib.decompress(data, -zlib.MAX_WBITS) |
|
3374 |
+ except Exception as e: |
|
3375 |
+ result['bozo'] = 1 |
|
3376 |
+ result['bozo_exception'] = e |
|
3377 |
+ data = '' |
|
3378 |
+ |
|
3379 |
+ # save HTTP headers |
|
3380 |
+ if hasattr(f, 'info'): |
|
3381 |
+ info = f.info() |
|
3382 |
+ etag = info.getheader('ETag') |
|
3383 |
+ if etag: |
|
3384 |
+ result['etag'] = etag |
|
3385 |
+ last_modified = info.getheader('Last-Modified') |
|
3386 |
+ if last_modified: |
|
3387 |
+ result['modified'] = _parse_date(last_modified) |
|
3388 |
+ if hasattr(f, 'url'): |
|
3389 |
+ result['href'] = f.url |
|
3390 |
+ result['status'] = 200 |
|
3391 |
+ if hasattr(f, 'status'): |
|
3392 |
+ result['status'] = f.status |
|
3393 |
+ if hasattr(f, 'headers'): |
|
3394 |
+ result['headers'] = f.headers.dict |
|
3395 |
+ if hasattr(f, 'close'): |
|
3396 |
+ f.close() |
|
3397 |
+ |
|
3398 |
+ # there are four encodings to keep track of: |
|
3399 |
+ # - http_encoding is the encoding declared in the Content-Type HTTP header |
|
3400 |
+ # - xml_encoding is the encoding declared in the <?xml declaration |
|
3401 |
+ # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data |
|
3402 |
+ # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications |
|
3403 |
+ http_headers = result.get('headers', {}) |
|
3404 |
+ result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ |
|
3405 |
+ _getCharacterEncoding(http_headers, data) |
|
3406 |
+ if http_headers and (not acceptable_content_type): |
|
3407 |
+ if http_headers.has_key('content-type'): |
|
3408 |
+ bozo_message = '%s is not an XML media type' % http_headers['content-type'] |
|
3409 |
+ else: |
|
3410 |
+ bozo_message = 'no Content-type specified' |
|
3411 |
+ result['bozo'] = 1 |
|
3412 |
+ result['bozo_exception'] = NonXMLContentType(bozo_message) |
|
3413 |
+ |
|
3414 |
+ result['version'], data, entities = _stripDoctype(data) |
|
3415 |
+ |
|
3416 |
+ baseuri = http_headers.get('content-location', result.get('href')) |
|
3417 |
+ baselang = http_headers.get('content-language', None) |
|
3418 |
+ |
|
3419 |
+ # if server sent 304, we're done |
|
3420 |
+ if result.get('status', 0) == 304: |
|
3421 |
+ result['version'] = '' |
|
3422 |
+ result['debug_message'] = 'The feed has not changed since you last checked, ' + \ |
|
3423 |
+ 'so the server sent no data. This is a feature, not a bug!' |
|
3424 |
+ return result |
|
3425 |
+ |
|
3426 |
+ # if there was a problem downloading, we're done |
|
3427 |
+ if not data: |
|
3428 |
+ return result |
|
3429 |
+ |
|
3430 |
+ # determine character encoding |
|
3431 |
+ use_strict_parser = 0 |
|
3432 |
+ known_encoding = 0 |
|
3433 |
+ tried_encodings = [] |
|
3434 |
+ # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM |
|
3435 |
+ for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): |
|
3436 |
+ if not proposed_encoding: continue |
|
3437 |
+ if proposed_encoding in tried_encodings: continue |
|
3438 |
+ tried_encodings.append(proposed_encoding) |
|
3439 |
+ try: |
|
3440 |
+ data = _toUTF8(data, proposed_encoding) |
|
3441 |
+ known_encoding = use_strict_parser = 1 |
|
3442 |
+ break |
|
3443 |
+ except: |
|
3444 |
+ pass |
|
3445 |
+ # if no luck and we have auto-detection library, try that |
|
3446 |
+ if (not known_encoding) and chardet: |
|
3447 |
+ try: |
|
3448 |
+ proposed_encoding = chardet.detect(data)['encoding'] |
|
3449 |
+ if proposed_encoding and (proposed_encoding not in tried_encodings): |
|
3450 |
+ tried_encodings.append(proposed_encoding) |
|
3451 |
+ data = _toUTF8(data, proposed_encoding) |
|
3452 |
+ known_encoding = use_strict_parser = 1 |
|
3453 |
+ except: |
|
3454 |
+ pass |
|
3455 |
+ # if still no luck and we haven't tried utf-8 yet, try that |
|
3456 |
+ if (not known_encoding) and ('utf-8' not in tried_encodings): |
|
3457 |
+ try: |
|
3458 |
+ proposed_encoding = 'utf-8' |
|
3459 |
+ tried_encodings.append(proposed_encoding) |
|
3460 |
+ data = _toUTF8(data, proposed_encoding) |
|
3461 |
+ known_encoding = use_strict_parser = 1 |
|
3462 |
+ except: |
|
3463 |
+ pass |
|
3464 |
+ # if still no luck and we haven't tried windows-1252 yet, try that |
|
3465 |
+ if (not known_encoding) and ('windows-1252' not in tried_encodings): |
|
3466 |
+ try: |
|
3467 |
+ proposed_encoding = 'windows-1252' |
|
3468 |
+ tried_encodings.append(proposed_encoding) |
|
3469 |
+ data = _toUTF8(data, proposed_encoding) |
|
3470 |
+ known_encoding = use_strict_parser = 1 |
|
3471 |
+ except: |
|
3472 |
+ pass |
|
3473 |
+ # if still no luck and we haven't tried iso-8859-2 yet, try that. |
|
3474 |
+ if (not known_encoding) and ('iso-8859-2' not in tried_encodings): |
|
3475 |
+ try: |
|
3476 |
+ proposed_encoding = 'iso-8859-2' |
|
3477 |
+ tried_encodings.append(proposed_encoding) |
|
3478 |
+ data = _toUTF8(data, proposed_encoding) |
|
3479 |
+ known_encoding = use_strict_parser = 1 |
|
3480 |
+ except: |
|
3481 |
+ pass |
|
3482 |
+ # if still no luck, give up |
|
3483 |
+ if not known_encoding: |
|
3484 |
+ result['bozo'] = 1 |
|
3485 |
+ result['bozo_exception'] = CharacterEncodingUnknown( \ |
|
3486 |
+ 'document encoding unknown, I tried ' + \ |
|
3487 |
+ '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \ |
|
3488 |
+ (result['encoding'], xml_encoding)) |
|
3489 |
+ result['encoding'] = '' |
|
3490 |
+ elif proposed_encoding != result['encoding']: |
|
3491 |
+ result['bozo'] = 1 |
|
3492 |
+ result['bozo_exception'] = CharacterEncodingOverride( \ |
|
3493 |
+ 'document declared as %s, but parsed as %s' % \
|
3494 |
+ (result['encoding'], proposed_encoding)) |
|
3495 |
+ result['encoding'] = proposed_encoding |
|
3496 |
+ |
|
3497 |
+ if not _XML_AVAILABLE: |
|
3498 |
+ use_strict_parser = 0 |
|
3499 |
+ if use_strict_parser: |
|
3500 |
+ # initialize the SAX parser |
|
3501 |
+ feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') |
|
3502 |
+ saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) |
|
3503 |
+ saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) |
|
3504 |
+ saxparser.setContentHandler(feedparser) |
|
3505 |
+ saxparser.setErrorHandler(feedparser) |
|
3506 |
+ source = xml.sax.xmlreader.InputSource() |
|
3507 |
+ source.setByteStream(_StringIO(data)) |
|
3508 |
+ if hasattr(saxparser, '_ns_stack'): |
|
3509 |
+ # work around bug in built-in SAX parser (doesn't recognize xml: namespace) |
|
3510 |
+ # PyXML doesn't have this problem, and it doesn't have _ns_stack either |
|
3511 |
+ saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'}) |
|
3512 |
+ try: |
|
3513 |
+ saxparser.parse(source) |
|
3514 |
+ except Exception as e:
|
3515 |
+ if _debug: |
|
3516 |
+ import traceback |
|
3517 |
+ traceback.print_stack() |
|
3518 |
+ traceback.print_exc() |
|
3519 |
+ sys.stderr.write('xml parsing failed\n') |
|
3520 |
+ result['bozo'] = 1 |
|
3521 |
+ result['bozo_exception'] = feedparser.exc or e |
|
3522 |
+ use_strict_parser = 0 |
|
3523 |
+ if not use_strict_parser: |
|
3524 |
+ feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities) |
|
3525 |
+ feedparser.feed(data) |
|
3526 |
+ result['feed'] = feedparser.feeddata |
|
3527 |
+ result['entries'] = feedparser.entries |
|
3528 |
+ result['version'] = result['version'] or feedparser.version |
|
3529 |
+ result['namespaces'] = feedparser.namespacesInUse |
|
3530 |
+ return result |
|
3531 |
+ |
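+# Illustrative sketch (not part of the original module): a typical conditional-GET
+# loop with parse(); the URL is hypothetical. Passing back the etag/modified values
+# from a previous result lets an unchanged feed come back with status 304 and no entries.
+#
+#     result = parse('http://example.com/feed.xml', agent='MyAggregator/1.0')
+#     if not result.bozo:
+#         for entry in result.entries:
+#             print entry.title
+#     later = parse('http://example.com/feed.xml',
+#                   etag=result.get('etag'), modified=result.get('modified'))
+#     # later.status == 304 when the feed has not changed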
|
3532 |
+class Serializer: |
|
3533 |
+ def __init__(self, results): |
|
3534 |
+ self.results = results |
|
3535 |
+ |
|
3536 |
+class TextSerializer(Serializer): |
|
3537 |
+ def write(self, stream=sys.stdout): |
|
3538 |
+ self._writer(stream, self.results, '') |
|
3539 |
+ |
|
3540 |
+ def _writer(self, stream, node, prefix): |
|
3541 |
+ if not node: return |
|
3542 |
+ if hasattr(node, 'keys'): |
|
3543 |
+ keys = node.keys() |
|
3544 |
+ keys.sort() |
|
3545 |
+ for k in keys: |
|
3546 |
+ if k in ('description', 'link'): continue |
|
3547 |
+ if node.has_key(k + '_detail'): continue |
|
3548 |
+ if node.has_key(k + '_parsed'): continue |
|
3549 |
+ self._writer(stream, node[k], prefix + k + '.') |
|
3550 |
+ elif type(node) == types.ListType: |
|
3551 |
+ index = 0 |
|
3552 |
+ for n in node: |
|
3553 |
+ self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].') |
|
3554 |
+ index += 1 |
|
3555 |
+ else: |
|
3556 |
+ try: |
|
3557 |
+ s = str(node).encode('utf-8') |
|
3558 |
+ s = s.replace('\\', '\\\\') |
|
3559 |
+ s = s.replace('\r', '') |
|
3560 |
+ s = s.replace('\n', r'\n') |
|
3561 |
+ stream.write(prefix[:-1]) |
|
3562 |
+ stream.write('=') |
|
3563 |
+ stream.write(s) |
|
3564 |
+ stream.write('\n') |
|
3565 |
+ except: |
|
3566 |
+ pass |
|
3567 |
+ |
|
3568 |
+class PprintSerializer(Serializer): |
|
3569 |
+ def write(self, stream=sys.stdout): |
|
3570 |
+ if self.results.has_key('href'): |
|
3571 |
+ stream.write(self.results['href'] + '\n\n') |
|
3572 |
+ from pprint import pprint |
|
3573 |
+ pprint(self.results, stream) |
|
3574 |
+ stream.write('\n') |
|
3575 |
+ |
|
3576 |
+if __name__ == '__main__': |
|
3577 |
+ try: |
|
3578 |
+ from optparse import OptionParser |
|
3579 |
+ except: |
|
3580 |
+ OptionParser = None |
|
3581 |
+ |
|
3582 |
+ if OptionParser: |
|
3583 |
+ optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-") |
|
3584 |
+ optionParser.set_defaults(format="pprint") |
|
3585 |
+ optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs") |
|
3586 |
+ optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs") |
|
3587 |
+ optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs") |
|
3588 |
+ optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") |
|
3589 |
+ optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)") |
|
3590 |
+ optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr") |
|
3591 |
+ (options, urls) = optionParser.parse_args() |
|
3592 |
+ if options.verbose: |
|
3593 |
+ _debug = 1 |
|
3594 |
+ if not urls: |
|
3595 |
+ optionParser.print_help() |
|
3596 |
+ sys.exit(0) |
|
3597 |
+ else: |
|
3598 |
+ if not sys.argv[1:]: |
|
3599 |
+ print __doc__ |
|
3600 |
+ sys.exit(0) |
|
3601 |
+ class _Options: |
|
3602 |
+ etag = modified = agent = referrer = None |
|
3603 |
+ format = 'pprint' |
|
3604 |
+ options = _Options() |
|
3605 |
+ urls = sys.argv[1:] |
|
3606 |
+ |
|
3607 |
+ zopeCompatibilityHack() |
|
3608 |
+ |
|
3609 |
+ serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer) |
|
3610 |
+ for url in urls: |
|
3611 |
+ results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer) |
|
3612 |
+ serializer(results).write(sys.stdout) |
... | ... |
@@ -0,0 +1,630 @@ |
1 |
+#!/usr/bin/python2.5
+# -*- coding: utf-8 -*-
|
2 |
+# chmod 755 me, and make sure I have UNIX style newlines. |
|
3 |
+# |
|
4 |
+# techcrunch.py |
|
5 |
+# |
|
6 |
+# http://feeds.feedburner.com/TechCrunch |
|
7 |
+# feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) |
|
8 |
+# feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments'] |
|
9 |
+# |
|
10 |
+# TODO: |
|
11 |
+# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>' |
|
12 |
+# link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/" |
|
13 |
+ |
|
14 |
+import feedparser |
|
15 |
+import yaml |
|
16 |
+import sys |
|
17 |
+import os |
|
18 |
+import time |
|
19 |
+import StringIO |
|
20 |
+import codecs |
|
21 |
+import traceback |
|
22 |
+import calendar |
|
23 |
+import pickle |
|
24 |
+import exceptions |
|
25 |
+import urllib |
|
26 |
+import urllib2 |
|
27 |
+import httplib |
|
28 |
+import shutil |
|
29 |
+import glob |
|
30 |
+import smtplib |
|
31 |
+import bisect |
|
32 |
+import analysis |
|
33 |
+import simplejson as json |
|
34 |
+import cookielib |
|
35 |
+ |
|
36 |
+debug = True |
|
37 |
+any_entry_added = False |
|
38 |
+ |
|
39 |
+localdir = '' |
|
40 |
+ |
|
41 |
+html_head = """ |
|
42 |
+<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'> |
|
43 |
+<HTML><HEAD> |
|
44 |
+ <title>TechCrunch Feed Filter</title> |
|
45 |
+ <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> --> |
|
46 |
+ <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" /> |
|
47 |
+ <style type="text/css"> |
|
48 |
+    body { font-family: "Arial", sans-serif; }
|
49 |
+ .author { font-size: smaller; } |
|
50 |
+    h3 { font-size: larger; }
|
51 |
+ a { text-decoration: none; } |
|
52 |
+ /* table { border: none; border-collapse:collapse; font-size: large } */ |
|
53 |
+ table { border-collapse: collapse; } |
|
54 |
+    table.legend { border:1px solid LightSlateGray; font-size: medium; border-collapse:separate; }
|
55 |
+ table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; } |
|
56 |
+ table.legend td { border: 1px solid LightSlateGray; } |
|
57 |
+ tr.even { background:#%s; padding: 2em; } |
|
58 |
+ tr.odd { background:#%s; padding-bottom: 2em; } |
|
59 |
+ </style> |
|
60 |
+</HEAD> |
|
61 |
+<BODY> |
|
62 |
+<div align='center'><h3>TechCrunch Feed Filter</h3></div> |
|
63 |
+This page shows the analysis used to filter noise out of the TechCrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
|
64 |
+""" |
|
65 |
+ |
|
66 |
+html_footer = """ |
|
67 |
+</table> |
|
68 |
+</div><br /> |
|
69 |
+<div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>, |
|
70 |
+<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> • <a href="stats.txt">status</a></div><br /> |
|
71 |
+</BODY> |
|
72 |
+</HTML> |
|
73 |
+""" |
|
74 |
+ |
|
75 |
+img_width = 300 |
|
76 |
+img_height = 50 |
|
77 |
+ |
|
78 |
+series_1_color = "0000FF" |
|
79 |
+series_2_color = "00AA00" |
|
80 |
+threshold_color = "FF8C00" |
|
81 |
+ |
|
82 |
+even_background = "F8F8F8" |
|
83 |
+#even_background = "FFFFFF" |
|
84 |
+odd_background = "E8E8E8" |
|
85 |
+ |
|
86 |
+def asciiize( s ): |
|
87 |
+ try: |
|
88 |
+ return s.encode( 'ascii' ) |
|
89 |
+ except UnicodeEncodeError, e: |
|
90 |
+ return s |
|
91 |
+ except exceptions.AttributeError, e: |
|
92 |
+ return s |
|
93 |
+ |
|
94 |
+def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ): |
|
95 |
+ """Sends Email""" |
|
96 |
+ smtp = smtplib.SMTP( 'localhost' ) |
|
97 |
+ smtp.sendmail( fromaddr, \ |
|
98 |
+ toaddrs, \ |
|
99 |
+                   "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % \
|
100 |
+ ( fromaddr, ", ".join( toaddrs ), subject, message ) ) |
|
101 |
+ smtp.quit() |
|
102 |
+ |
|
103 |
+def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color ): |
|
104 |
+# comment_times, comment_values = zip( *comments ) |
|
105 |
+# retweet_times, retweet_values = zip( *retweets ) |
|
106 |
+ |
|
107 |
+ # TODO handle failure cases, -1 |
|
108 |
+ |
|
109 |
+ if not len( comment_times ): |
|
110 |
+ comment_times = [ time_posted, ] |
|
111 |
+ if not len( comment_values ): |
|
112 |
+ comment_values = [ 0, ] |
|
113 |
+ if not len( retweet_times ): |
|
114 |
+ retweet_times = [ time_posted, ] |
|
115 |
+ if not len( retweet_values ): |
|
116 |
+ retweet_values = [ 0, ] |
|
117 |
+ |
|
118 |
+# comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ] |
|
119 |
+# retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ] |
|
120 |
+ comment_times = [ (i - time_posted) / 1800 for i in comment_times ] |
|
121 |
+ retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ] |
|
122 |
+ |
|
123 |
+ min_comment_time = min( comment_times ) |
|
124 |
+ max_comment_time = max( comment_times ) |
|
125 |
+ min_comment_value = min( comment_values ) |
|
126 |
+ max_comment_value = max( comment_values ) |
|
127 |
+ min_retweet_time = min( retweet_times ) |
|
128 |
+ max_retweet_time = max( retweet_times ) |
|
129 |
+ min_retweet_value = min( retweet_values ) |
|
130 |
+ max_retweet_value = max( retweet_values ) |
|
131 |
+ |
|
132 |
+ if len( comment_values ) < 8 and len( comment_values ) > 1: |
|
133 |
+ # max_comment_value *= 2 |
|
134 |
+ pass |
|
135 |
+ elif len( comment_values ) == 1: |
|
136 |
+ min_comment_value = 0 |
|
137 |
+ if len( retweet_values ) < 8 and len( retweet_values ) > 1: |
|
138 |
+ # max_retweet_value *= 2 |
|
139 |
+ pass |
|
140 |
+ elif len( retweet_values ) == 1: |
|
141 |
+ min_retweet_value = 0 |
|
142 |
+ |
|
143 |
+ min_comment_value = 0 |
|
144 |
+ min_retweet_value = 0 |
|
145 |
+ |
|
146 |
+ chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \ |
|
147 |
+ ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color ) |
|
148 |
+ chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ), |
|
149 |
+ ','.join( [ str( n ) for n in comment_values ] ), |
|
150 |
+ ','.join( [ str( n ) for n in retweet_times ] ), |
|
151 |
+ ','.join( [ str( n ) for n in retweet_values ] ) ) |
|
152 |
+ if met_threshold_pt != -1: |
|
153 |
+ chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt ) |
|
154 |
+ chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \ |
|
155 |
+ ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value, |
|
156 |
+ 0, max( 7, max_comment_time ), |
|
157 |
+ min_comment_value, max_comment_value, |
|
158 |
+ 0, max( 7, max_retweet_time ), |
|
159 |
+                                 min_retweet_value, max_retweet_value )
|
160 |
+ chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, ) |
|
161 |
+ return chart_url |
|
162 |
+ |
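+# Illustrative use of make_chart_url above, with hypothetical values: an item
+# posted at t0, sampled twice for both series, with no qualifying point yet
+# (met_threshold_pt == -1):
+#   make_chart_url( t0, [t0, t0 + 1800], [0, 5], [t0, t0 + 1800], [0, 12], -1, even_background )
+# returns a chart.apis.google.com "lxy" URL plotting comments and retweets
+# against half-hour buckets since posting.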
|
163 |
+def process_feed( yaml_items ): |
|
164 |
+ """ |
|
165 |
+    Retrieve the TechCrunch feed and process its entries.
|
166 |
+    yaml_items (in, out): the list of tracked item dicts; new entries are inserted at the front.
|
167 |
+ """ |
|
168 |
+ |
|
169 |
+ feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) |
|
170 |
+ if hasattr( feed, 'status' ): |
|
171 |
+ if feed.status == 304: |
|
172 |
+ pass |
|
173 |
+ else: |
|
174 |
+ feed_is_modified = True |
|
175 |
+ if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302: |
|
176 |
+ if feed.status == 503: |
|
177 |
+ print "the feed is temporarily unavailable." |
|
178 |
+ elif feed.status == 400: |
|
179 |
+ print "the feed says we made a bad request." |
|
180 |
+ elif feed.status == 502: |
|
181 |
+ print "the feed reported a bad gateway error." |
|
182 |
+ elif feed.status == 404: |
|
183 |
+ print "the feed says the page was not found." |
|
184 |
+ elif feed.status == 500: |
|
185 |
+ print "the feed had an internal server error." |
|
186 |
+ elif feed.status == 403: |
|
187 |
+ print "Access to the feed was forbidden." |
|
188 |
+ else: |
|
189 |
+ print "the feed returned feed.status %d." % ( feed.status, ) |
|
190 |
+ else: |
|
191 |
+ # Save off this |
|
192 |
+ f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' ) |
|
193 |
+ try: |
|
194 |
+ pickle.dump( feed, f ) |
|
195 |
+ except( pickle.PicklingError, exceptions.TypeError ), e: |
|
196 |
+ print "An error occurred while pickling the feed: %s." % \ |
|
197 |
+ ( # str(e.__class__), |
|
198 |
+ str(e) ) |
|
199 |
+ traceback.print_exc( file = sys.stdout ) |
|
200 |
+ feed_is_modified = False |
|
201 |
+ f.close() |
|
202 |
+ |
|
203 |
+ for i in reversed( feed.entries ): |
|
204 |
+ process_item( i, yaml_items ) |
|
205 |
+ |
|
206 |
+ # If we have more than 200 items, remove the old ones. |
|
207 |
+ while len( yaml_items ) > 200: |
|
208 |
+ yaml_items.pop() |
|
209 |
+ |
|
210 |
+ cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) ) |
|
211 |
+ |
|
212 |
+ for i in yaml_items: |
|
213 |
+ # i['title'] = asciiize( i['title'] ) |
|
214 |
+ # i['tags'] = map( asciiize, i['tags'] ) |
|
215 |
+ process_yaml_item( i, cookie ) |
|
216 |
+ |
|
217 |
+ else: |
|
218 |
+ if hasattr(feed, 'bozo_exception'): |
|
219 |
+ e = feed.bozo_exception |
|
220 |
+ if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110: |
|
221 |
+ print_last_line = True |
|
222 |
+ if hasattr(e, 'reason'): |
|
223 |
+ if e.reason[0] == 110: |
|
224 |
+ print "the feed's connection timed out." |
|
225 |
+ print_last_line = False |
|
226 |
+ elif e.reason[0] == 111: |
|
227 |
+ print "the feed's connection was refused." |
|
228 |
+ print_last_line = False |
|
229 |
+ elif e.reason[0] == 104: |
|
230 |
+ print "the feed reset the connection." |
|
231 |
+ print_last_line = False |
|
232 |
+ else: |
|
233 |
+ print "the feed had a URLError with reason %s." % ( str(e.reason), ) |
|
234 |
+ print_last_line = False |
|
235 |
+ if print_last_line: |
|
236 |
+ print "the feed had a URLError %s" % ( str(e), ) |
|
237 |
+ elif isinstance( e, httplib.BadStatusLine ): |
|
238 |
+ if hasattr(e, 'message'): |
|
239 |
+ print "the feed gave a bad status line %s." % ( str(e.message ), ) |
|
240 |
+ else: |
|
241 |
+ print "the feed gave a bad status line." |
|
242 |
+ else: |
|
243 |
+ if len( str(e) ): |
|
244 |
+ print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) ) |
|
245 |
+ else: |
|
246 |
+ print "the feed bozo_exception: %s %s" % ( str(e.__class__), repr(e) ) |
|
247 |
+ else: |
|
248 |
+ print "the feed returned class %s, %s" % ( str(feed.__class__), str(feed) ) |
|
249 |
+ |
|
250 |
+def process_item( feed_item, yaml_items ): |
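+    # Find (or create) the tracked yaml item matching this feed entry; new items
+    # are inserted at the front of yaml_items. If the entry is still fresh,
+    # append the current slash_comments sample.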
|
251 |
+ # Get the time |
|
252 |
+ global any_entry_added |
|
253 |
+ timecode_now = int( time.time() ) |
|
254 |
+ date_parsed = time.gmtime() |
|
255 |
+ if hasattr( feed_item, 'issued_parsed' ): |
|
256 |
+ date_parsed = feed_item.issued_parsed |
|
257 |
+ date_set = True |
|
258 |
+ elif hasattr( feed_item, 'date_parsed' ): |
|
259 |
+ date_parsed = feed_item.date_parsed |
|
260 |
+ date_set = True |
|
261 |
+ else: |
|
262 |
+ print "process_item found no timestamp for", asciiize( feed_item.link ) |
|
263 |
+ timecode_parsed = calendar.timegm( date_parsed ) |
|
264 |
+ |
|
265 |
+ # Look for i.feedburner_origlink in yaml_items |
|
266 |
+ yaml_item = None |
|
267 |
+ for i in yaml_items: |
|
268 |
+ if feed_item.feedburner_origlink == i['link']: |
|
269 |
+ yaml_item = i |
|
270 |
+ break |
|
271 |
+ if not yaml_item: |
|
272 |
+ author = '' |
|
273 |
+ link = feed_item.link |
|
274 |
+ if hasattr( feed_item, 'author' ): |
|
275 |
+ author = asciiize( feed_item.author ) |
|
276 |
+ if hasattr( feed_item, 'feedburner_origlink' ): |
|
277 |
+ link = feed_item.feedburner_origlink |
|
278 |
+ |
|
279 |
+ # Make a new yaml_item |
|
280 |
+ yaml_item = { 'title' : asciiize( feed_item.title ), |
|
281 |
+ 'link' : asciiize( link ), |
|
282 |
+ 'author' : author, |
|
283 |
+ 'tags' : [], |
|
284 |
+ 'orig_posted' : timecode_parsed, |
|
285 |
+ 'qualified' : -1, |
|
286 |
+ 'comment_times' : [], |
|
287 |
+ 'comments' : [], |
|
288 |
+ 'slash_comment_times' : [], |
|
289 |
+ 'slash_comments' : [], |
|
290 |
+ 'retweet_times' : [], |
|
291 |
+ 'retweets' : [] |
|
292 |
+ } |
|
293 |
+ if hasattr( feed_item, 'tags' ): |
|
294 |
+ for i in feed_item.tags: |
|
295 |
+ yaml_item['tags'].append( asciiize( i.term ) ) |
|
296 |
+ |
|
297 |
+ yaml_items.insert( 0, yaml_item ) |
|
298 |
+ any_entry_added = True |
|
299 |
+ |
|
300 |
+ # Maybe check to ensure that this item isn't too old. |
|
301 |
+ if timecode_parsed < timecode_now - 60 * 30 * 9: |
|
302 |
+ return |
|
303 |
+ |
|
304 |
+ # Now, add the new values |
|
305 |
+ if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8: |
|
306 |
+ any_entry_added = True |
|
307 |
+ yaml_item['slash_comment_times'].append( timecode_now ) |
|
308 |
+ yaml_item['slash_comments'].append( int( feed_item.slash_comments ) ) |
|
309 |
+ |
|
310 |
+def process_yaml_item( yaml_item, cookie ): |
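+    # Poll Disqus and TweetMeme for this item until eight samples of each have
+    # been collected; each successful poll appends a (time, count) pair.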
|
311 |
+ global any_entry_added |
|
312 |
+ |
|
313 |
+ timecode_now = int( time.time() ) |
|
314 |
+ if len( yaml_item['comments'] ) < 8: |
|
315 |
+ num_comments = Get_num_disqus_comments( yaml_item['link'], cookie ) |
|
316 |
+ if num_comments != -1: |
|
317 |
+ any_entry_added = True |
|
318 |
+ yaml_item['comment_times'].append( timecode_now ) |
|
319 |
+ yaml_item['comments'].append( num_comments ) |
|
320 |
+ |
|
321 |
+ if len( yaml_item['retweets'] ) < 8: |
|
322 |
+ num_retweets = Get_num_retweets( yaml_item['link'] ) |
|
323 |
+ if num_retweets != -1: |
|
324 |
+ any_entry_added = True |
|
325 |
+ yaml_item['retweet_times'].append( timecode_now ) |
|
326 |
+ yaml_item['retweets'].append( num_retweets ) |
|
327 |
+ |
|
328 |
+def Get_num_comments( url_string ): |
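+    # Scrape the article page itself for the comment count. This appears to be
+    # a legacy path; process_yaml_item uses the Disqus variant below instead.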
|
329 |
+ try: |
|
330 |
+ f = urllib2.urlopen( url_string ) |
|
331 |
+ data = f.read() |
|
332 |
+ f.close() |
|
333 |
+ except urllib2.URLError, e: |
|
334 |
+ if hasattr( e, 'reason' ): |
|
335 |
+ print "Get_num_comments got an error:", e.reason |
|
336 |
+ elif hasattr( e, 'code' ): |
|
337 |
+ print "Get_num_comments got an error. Code:", e.code |
|
338 |
+ return -1 |
|
339 |
+ tag_to_find = '<a href="#comments" rel="nofollow">' |
|
340 |
+ offset = data.find( tag_to_find ) |
|
341 |
+ if offset != -1: |
|
342 |
+ start_pos = offset + len( tag_to_find ) |
|
343 |
+ end_pos = start_pos |
|
344 |
+ while str.isdigit( data[ end_pos ] ): |
|
345 |
+ end_pos += 1 |
|
346 |
+ if end_pos > start_pos: |
|
347 |
+ return int( data[start_pos:end_pos] ) |
|
348 |
+ return -1 |
|
349 |
+ |
|
350 |
+def Get_cookie( cookie_request ): |
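+    # Fetch the request once just to harvest any cookies the server sets; the
+    # returned jar is reused by Get_num_disqus_comments for the count requests.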
|
351 |
+ cookie = cookielib.CookieJar() |
|
352 |
+ try: |
|
353 |
+ cookie_response = urllib2.urlopen( cookie_request ) |
|
354 |
+ cookie.extract_cookies( cookie_response, cookie_request ) |
|
355 |
+ return cookie |
|
356 |
+ except urllib2.URLError, e: |
|
357 |
+ if hasattr( e, 'reason' ): |
|
358 |
+ print "Get_cookie got an error:", e.reason |
|
359 |
+ elif hasattr( e, 'code' ): |
|
360 |
+ print "Get_cookie got an error. Code:", e.code |
|
361 |
+ return None |
|
362 |
+ |
|
363 |
+def Get_num_disqus_comments( url_string, cookie ): |
|
364 |
+ |
|
365 |
+ if cookie == None: |
|
366 |
+ return -1 |
|
367 |
+ |
|
368 |
+ try: |
|
369 |
+ f = urllib2.urlopen( url_string ) |
|
370 |
+ data = f.read() |
|
371 |
+ f.close() |
|
372 |
+ except urllib2.URLError, e: |
|
373 |
+ if hasattr( e, 'reason' ): |
|
374 |
+ print "Get_num_disqus_comments got an error:", e.reason |
|
375 |
+ elif hasattr( e, 'code' ): |
|
376 |
+ print "Get_num_disqus_comments got an error. Code:", e.code |
|
377 |
+ return -1 |
|
378 |
+ |
|
379 |
+ tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="' |
|
380 |
+ disqus_tag_to_find = 'displayCount(' |
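+    # The Disqus count.js response is expected to look roughly like
+    #   displayCount({"counts": [{"comments": 42, ...}]})
+    # (shape inferred from the parsing below); the JSON argument is sliced out
+    # and counts[0]["comments"] is returned.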
|
381 |
+ offset = data.find( tag_to_find ) |
|
382 |
+ if offset != -1: |
|
383 |
+ start_pos = offset + len( tag_to_find ) |
|
384 |
+ end_pos = start_pos |
|
385 |
+ while data[ end_pos ] != '"' and end_pos < start_pos + 200: |
|
386 |
+ end_pos += 1 |
|
387 |
+ if end_pos < start_pos + 200: |
|
388 |
+ opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) ) |
|
389 |
+ url_GET_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' ) |
|
390 |
+ request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + url_GET_data ) |
|
391 |
+ try: |
|
392 |
+ response = opener.open( request ) |
|
393 |
+ disqus_data = response.read() |
|
394 |
+ except urllib2.URLError, e: |
|
395 |
+ if hasattr( e, 'reason' ): |
|
396 |
+ print "Get_num_disqus_comments got an error getting the count:", e.reason |
|
397 |
+ elif hasattr( e, 'code' ): |
|
398 |
+ print "Get_num_disqus_comments got an error getting the count. Code:", e.code |
|
399 |
+ disqus_data = "" |
|
400 |
+ disqus_offset = disqus_data.find( disqus_tag_to_find ) |
|
401 |
+ if disqus_offset != -1: |
|
402 |
+ start_pos = disqus_offset + len( disqus_tag_to_find ) |
|
403 |
+ end_pos = disqus_data.find( '}]})', start_pos ) |
|
404 |
+ if end_pos != -1: |
|
405 |
+ return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] ) |
|
406 |
+ |
|
407 |
+ return -1 |
|
408 |
+ |
|
409 |
+def Get_num_retweets( url_string ): |
|
410 |
+ try: |
|
411 |
+ f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) ) |
|
412 |
+ data = f.read() |
|
413 |
+ f.close() |
|
414 |
+ except urllib2.URLError, e: |
|
415 |
+ if hasattr( e, 'reason' ): |
|
416 |
+ print "Get_num_retweets got an error:", e.reason |
|
417 |
+ elif hasattr( e, 'code' ): |
|
418 |
+ print "Get_num_retweets got an error. Code:", e.code |
|
419 |
+ return -1 |
|
420 |
+ tag_to_find = '<span class="c">' |
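+    # button.js is expected to embed the count as '<span class="c">57</span>'
+    # (markup shape inferred from the search below); the digits between the
+    # tags are returned as the retweet count.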
|
421 |
+ offset = data.find( tag_to_find ) |
|
422 |
+ if offset != -1: |
|
423 |
+ start_pos = offset + len( tag_to_find ) |
|
424 |
+ end_pos = data.find( '<', start_pos ) |
|
425 |
+ if end_pos != -1: |
|
426 |
+ return int( data[ start_pos:end_pos ] ) |
|
427 |
+ return -1 |
|
428 |
+ |
|
429 |
+def Save_image( url_string, file_path ): |
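+    # Download the chart image to file_path and return its 'cache/' relative
+    # path; on a fetch error or an implausibly small response, fall back to
+    # returning the remote URL unchanged.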
|
430 |
+ try: |
|
431 |
+ f = urllib2.urlopen( url_string ) |
|
432 |
+ data = f.read() |
|
433 |
+ f.close() |
|
434 |
+ except urllib2.URLError, e: |
|
435 |
+ if hasattr( e, 'reason' ): |
|
436 |
+ print "Save_image got an error:", e.reason |
|
437 |
+ elif hasattr( e, 'code' ): |
|
438 |
+ print "Save_image got an error. Code:", e.code |
|
439 |
+ return url_string |
|
440 |
+ if len( data ) > 50: |
|
441 |
+ f = open( file_path, 'wb' ) |
|
442 |
+ f.write( data ) |
|
443 |
+ f.close() |
|
444 |
+ return 'cache/' + os.path.basename( file_path ) |
|
445 |
+ return url_string |
|
446 |
+ |
|
447 |
+def Make_index_html( yaml_items, stats ): |
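+    # Regenerate index.html: a legend row of per-bucket stats, then one row per
+    # recent item with a star for qualified items and its cached chart image.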
|
448 |
+ cur_time = int( time.time() ) |
|
449 |
+ new_index_fullpath = os.path.join( localdir, 'index.html_new' ) |
|
450 |
+ index_fullpath = os.path.join( localdir, 'index.html' ) |
|
451 |
+ cache_path = os.path.join( localdir, 'cache' ) |
|
452 |
+ |
|
453 |
+    files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )
|
454 |
+# shutil.rmtree( cache_path ) |
|
455 |
+# os.mkdir( cache_path ) |
|
456 |
+ |
|
457 |
+ f = file( new_index_fullpath, 'w' ) |
|
458 |
+ f.write( html_head % ( even_background, odd_background ) ) |
|
459 |
+# f.write( '<div align="center">\n<table cellpadding="4">' ) |
|
460 |
+ |
|
461 |
+ f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' ) |
|
462 |
+ for median, mean, std_dev in stats: |
|
463 |
+ f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f </td> ' % ( median, mean, std_dev ) ) |
|
464 |
+ f.write( '</tr>\n</table></div>\n<br />\n' ) |
|
465 |
+ |
|
466 |
+ f.write( '<div align="center">\n<table>\n' ) |
|
467 |
+ image_index = 0 |
|
468 |
+ for i in yaml_items[:40]: |
|
469 |
+ chart_url = make_chart_url( i['orig_posted'], |
|
470 |
+ i['comment_times'], |
|
471 |
+ i['comments'], |
|
472 |
+ i['retweet_times'], |
|
473 |
+ i['retweets'], |
|
474 |
+ i['qualified'], |
|
475 |
+ image_index % 2 and even_background or odd_background, |
|
476 |
+ ) |
|
477 |
+ image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) ) |
|
478 |
+        f.write( '<tr valign="middle" class="%s">\n    <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
|
479 |
+ ( image_index % 2 and "even" or "odd", |
|
480 |
+ i['link'], |
|
481 |
+ i['title'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
482 |
+ i['author'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
483 |
+ ) |
|
484 |
+ ) |
|
485 |
+        f.write( '    <td>%s</td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
|
486 |
+ f.write( ' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \ |
|
487 |
+ ( image_url, |
|
488 |
+ img_width, |
|
489 |
+ img_height |
|
490 |
+ ) |
|
491 |
+ ) |
|
492 |
+ image_index += 1 |
|
493 |
+ f.write( html_footer ) |
|
494 |
+ f.close() |
|
495 |
+ if os.path.exists( index_fullpath ): |
|
496 |
+ os.unlink( index_fullpath ) |
|
497 |
+ shutil.move( new_index_fullpath, index_fullpath ) |
|
498 |
+ for fname in files_to_delete: |
|
499 |
+ os.unlink( fname ) |
|
500 |
+ |
|
501 |
+def Make_feed_file( yaml_items ): |
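+    # Write rss_feed.xml with only the qualified items, newest first, capped at
+    # 15 entries.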
|
502 |
+ f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' ) |
|
503 |
+ f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" ) |
|
504 |
+ f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) ) |
|
505 |
+ count = 0 |
|
506 |
+ for item in yaml_items: |
|
507 |
+ now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) ) |
|
508 |
+ if item['qualified'] != -1: |
|
509 |
+ f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \ |
|
510 |
+ ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ), now, item['link'], item['link'], item['author'].encode( 'ascii', 'xmlcharrefreplace' ) ) ) |
|
511 |
+ count += 1 |
|
512 |
+ if count > 14: |
|
513 |
+ break |
|
514 |
+ f.write( "</channel></rss>" ) |
|
515 |
+ f.close() |
|
516 |
+ |
|
517 |
+if __name__=='__main__': |
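+    # Overall flow: capture stdout/stderr, load techcrunch.yaml, poll the feed,
+    # qualify items against the retweet threshold, rewrite the yaml/RSS/HTML
+    # outputs, then append a status line to stats.txt.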
|
518 |
+ start_time = time.time() |
|
519 |
+ progress_text = [] |
|
520 |
+ |
|
521 |
+ old_stdout = sys.stdout |
|
522 |
+ old_stderr = sys.stderr |
|
523 |
+ sys.stdout = sys.stderr = StringIO.StringIO() |
|
524 |
+ |
|
525 |
+ try: |
|
526 |
+ localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) ) |
|
527 |
+ # |
|
528 |
+ # Read in techcrunch.yaml |
|
529 |
+ # |
|
530 |
+ # [ { 'title' : 'Title Text', |
|
531 |
+ # 'link' : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/', |
|
532 |
+ # 'author' : u'MG Siegler', |
|
533 |
+ # 'orig_posted' : 1282197199 |
|
534 |
+ # 'tags' : [ u'Google', u'privacy' ] |
|
535 |
+ # 'qualified' : -1 |
|
536 |
+ # 'comment_times' : [ 1282197199, 1282197407 ] |
|
537 |
+ # 'comments' : [ 0, 15 ] |
|
538 |
+ # 'slash_comment_times' : [ 1282197199, 1282197407 ] |
|
539 |
+ # 'slash_comments' : [ 0, 5 ] |
|
542 |
+ # 'retweet_times' : [ 1282197199, 1282197407 ] |
|
543 |
+ # 'retweets' : [ 0, 43 ] |
|
544 |
+ # }, |
|
545 |
+ # { ... } |
|
546 |
+ # ] |
|
547 |
+ # |
|
548 |
+ yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' ) |
|
549 |
+ if os.path.exists( yaml_fullpath ): |
|
550 |
+ f = file( yaml_fullpath, 'rb' ) |
|
551 |
+ items = yaml.load( f ) |
|
552 |
+ f.close() |
|
553 |
+ else: |
|
554 |
+ print "could not open", yaml_fullpath |
|
555 |
+ items = [] |
|
556 |
+ |
|
557 |
+ progress_text = [ "read techcrunch.yaml" ] |
|
558 |
+ process_feed( items ) |
|
559 |
+ |
|
560 |
+ # |
|
561 |
+ # If any work was done, then write files. |
|
562 |
+ # |
|
563 |
+ if True or any_entry_added: |
|
564 |
+ |
|
565 |
+ stats = analysis.Process_retweets_for_feed( items ) |
|
566 |
+ |
|
567 |
+ # We'll only look at the stats for the time 1:00 to 1:30 after posting. |
|
568 |
+ median, mean, sigma = stats[2] |
|
569 |
+ threshold = median + sigma |
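+            # Worked example (numbers illustrative only): if the 1:30 bucket has
+            # median 20 and sigma 10, an item qualifies once a retweet sample of
+            # 30 or more is seen within its first 90 minutes.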
|
570 |
+ for item in items: |
|
571 |
+ if item['qualified'] == -1: |
|
572 |
+ for i in range( len( item['retweet_times'] ) ): |
|
573 |
+ r_time = item['retweet_times'][i] |
|
574 |
+ if r_time - item['orig_posted'] < 5400: |
|
575 |
+ if item['retweets'][i] >= threshold: |
|
576 |
+ item['qualified'] = i |
|
577 |
+ if r_time - item['orig_posted'] >= 3600: |
|
578 |
+ break |
|
579 |
+ |
|
580 |
+ # |
|
581 |
+ # Write out the updated yaml file. |
|
582 |
+ # |
|
583 |
+ f = file( yaml_fullpath, 'wb' ) |
|
584 |
+ yaml.dump( items, f, width=120 ) |
|
585 |
+ f.close() |
|
586 |
+ f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' ) |
|
587 |
+ yaml.dump( items, f, width=120 ) |
|
588 |
+ f.close() |
|
589 |
+ f = codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' ) |
|
590 |
+ yaml.dump( items, f, encoding='utf-8', width=120 ) |
|
591 |
+ f.close() |
|
592 |
+ |
|
593 |
+ Make_feed_file( items ) |
|
594 |
+ |
|
595 |
+ Make_index_html( items, stats ) |
|
596 |
+ else: |
|
597 |
+ print "No entries were added this time." |
|
598 |
+ |
|
599 |
+ except Exception, e: |
|
600 |
+ exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str(e) |
|
601 |
+ print exceptional_text, ' '.join( progress_text ) |
|
602 |
+ traceback.print_exc( file = sys.stdout ) |
|
603 |
+ try: |
|
604 |
+ sendEmail( 'Exception thrown in techcrunch.py', |
|
605 |
+ exceptional_text, |
|
606 |
+ ( 'david.blume@gmail.com', ) ) |
|
607 |
+ except Exception, e: |
|
608 |
+ print "Could not send email to notify you of the exception. :(" |
|
609 |
+ |
|
610 |
+ message = sys.stdout.getvalue() |
|
611 |
+ sys.stdout = old_stdout |
|
612 |
+ sys.stderr = old_stderr |
|
613 |
+ if not debug: |
|
614 |
+ print message |
|
615 |
+ |
|
616 |
+ # Finally, let's save this to a statistics page |
|
617 |
+ if os.path.exists( os.path.join( localdir, 'stats.txt' ) ): |
|
618 |
+ f = open( os.path.join( localdir, 'stats.txt' )) |
|
619 |
+ try: |
|
620 |
+ lines = f.readlines() |
|
621 |
+ finally: |
|
622 |
+ f.close() |
|
623 |
+ else: |
|
624 |
+ lines = [] |
|
625 |
+ lines = lines[:168] # Just keep the past week's worth |
|
626 |
+ status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK" |
|
627 |
+ lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status )) |
|
628 |
+ f = open( os.path.join( localdir,'stats.txt' ), 'w' ) |
|
629 |
+ f.writelines( lines ) |
|
630 |
+ f.close() |
|