David Blume commited on 2018-01-20 20:10:33
Showing 4 changed files, with 4291 additions and 0 deletions.
| ... | ... |
@@ -0,0 +1,19 @@ |
| 1 |
+Copyright (c) 2018, David Blume |
|
| 2 |
+ |
|
| 3 |
+Permission is hereby granted, free of charge, to any person obtaining |
|
| 4 |
+a copy of this software and associated documentation files (the "Software"), |
|
| 5 |
+to deal in the Software without restriction, including without limitation |
|
| 6 |
+the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|
| 7 |
+and/or sell copies of the Software, and to permit persons to whom the |
|
| 8 |
+Software is furnished to do so, subject to the following conditions: |
|
| 9 |
+ |
|
| 10 |
+The above copyright notice and this permission notice shall be included |
|
| 11 |
+in all copies or substantial portions of the Software. |
|
| 12 |
+ |
|
| 13 |
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
|
| 14 |
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
| 15 |
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
|
| 16 |
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
| 17 |
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
| 18 |
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|
| 19 |
+DEALINGS IN THE SOFTWARE. |
| ... | ... |
@@ -0,0 +1,30 @@ |
| 1 |
+[](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt) |
|
| 2 |
+ |
|
| 3 |
+# TechCrunch Feed Filter |
|
| 4 |
+ |
|
| 5 |
+This is a Python script run as a cronjob to read the TechCrunch article feed, |
|
| 6 |
+and decide which articles to include in its own feed. |
|
| 7 |
+ |
|
| 8 |
+Here's a [blog post about it](http://david.dlma.com/blog/my-techcrunch-feed-filter). |
|
| 9 |
+ |
|
| 10 |
+# History |
|
| 11 |
+ |
|
| 12 |
+This was originally archived in a Subversion repo. I'd forgotten about the |
|
| 13 |
+version control and had gotten into the habit of just modifying the production |
|
| 14 |
+site. |
|
| 15 |
+ |
|
| 16 |
+* 2010-09-03: Original |
|
| 17 |
+* 2010-09-03: Save off the disqus identifier for use later. |
|
| 18 |
+* 2011-02-04: Algorithm changes (tags and author checked), new chart drawing, spaces used instead of tabs. |
|
| 19 |
+* 2011-02-04: Update to the chart drawing algorithm. |
|
| 20 |
+* 2013-08-04: Miscellaneous changes to techcrunch.py |
|
| 21 |
+* 2015-11-23: Resync svn with production site. |
|
| 22 |
+* 2015-11-27: Remove obsolete disqus and retweet code, and refactor style to be more PEP-8ish. |
|
| 23 |
+ |
|
| 24 |
+# Is it any good? |
|
| 25 |
+ |
|
| 26 |
+[Yes](https://news.ycombinator.com/item?id=3067434). |
|
| 27 |
+ |
|
| 28 |
+# Licence |
|
| 29 |
+ |
|
| 30 |
+This software uses the [MIT license](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt) |
| ... | ... |
@@ -0,0 +1,3612 @@ |
| 1 |
+#!/usr/bin/env python |
|
| 2 |
+"""Universal feed parser |
|
| 3 |
+ |
|
| 4 |
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds |
|
| 5 |
+ |
|
| 6 |
+Visit http://feedparser.org/ for the latest version |
|
| 7 |
+Visit http://feedparser.org/docs/ for the latest documentation |
|
| 8 |
+ |
|
| 9 |
+Required: Python 2.1 or later |
|
| 10 |
+Recommended: Python 2.3 or later |
|
| 11 |
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> |
|
| 12 |
+""" |
|
| 13 |
+ |
|
| 14 |
+__version__ = "4.2-pre-" + "$Revision: 291 $"[11:14] + "-svn" |
|
| 15 |
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved. |
|
| 16 |
+ |
|
| 17 |
+Redistribution and use in source and binary forms, with or without modification, |
|
| 18 |
+are permitted provided that the following conditions are met: |
|
| 19 |
+ |
|
| 20 |
+* Redistributions of source code must retain the above copyright notice, |
|
| 21 |
+ this list of conditions and the following disclaimer. |
|
| 22 |
+* Redistributions in binary form must reproduce the above copyright notice, |
|
| 23 |
+ this list of conditions and the following disclaimer in the documentation |
|
| 24 |
+ and/or other materials provided with the distribution. |
|
| 25 |
+ |
|
| 26 |
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' |
|
| 27 |
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
| 28 |
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
| 29 |
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
| 30 |
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
| 31 |
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
| 32 |
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
| 33 |
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
| 34 |
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
| 35 |
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
| 36 |
+POSSIBILITY OF SUCH DAMAGE.""" |
|
| 37 |
+__author__ = "Mark Pilgrim <http://diveintomark.org/>" |
|
| 38 |
+__contributors__ = ["Jason Diamond <http://injektilo.org/>", |
|
| 39 |
+ "John Beimler <http://john.beimler.org/>", |
|
| 40 |
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>", |
|
| 41 |
+ "Aaron Swartz <http://aaronsw.com/>", |
|
| 42 |
+ "Kevin Marks <http://epeus.blogspot.com/>", |
|
| 43 |
+ "Sam Ruby <http://intertwingly.net/>"] |
|
| 44 |
+_debug = 0 |
|
| 45 |
+ |
|
| 46 |
+# HTTP "User-Agent" header to send to servers when downloading feeds. |
|
| 47 |
+# If you are embedding feedparser in a larger application, you should |
|
| 48 |
+# change this to your application name and URL. |
|
| 49 |
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__ |
|
| 50 |
+ |
|
| 51 |
+# HTTP "Accept" header to send to servers when downloading feeds. If you don't |
|
| 52 |
+# want to send an Accept header, set this to None. |
|
| 53 |
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1" |
|
| 54 |
+ |
|
| 55 |
+# List of preferred XML parsers, by SAX driver name. These will be tried first, |
|
| 56 |
+# but if they're not installed, Python will keep searching through its own list |
|
| 57 |
+# of pre-installed parsers until it finds one that supports everything we need. |
|
| 58 |
+PREFERRED_XML_PARSERS = ["drv_libxml2"] |
|
| 59 |
+ |
|
| 60 |
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set |
|
| 61 |
+# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html> |
|
| 62 |
+# or utidylib <http://utidylib.berlios.de/>. |
|
| 63 |
+TIDY_MARKUP = 0 |
|
| 64 |
+ |
|
| 65 |
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful |
|
| 66 |
+# if TIDY_MARKUP = 1 |
|
| 67 |
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] |
|
| 68 |
+ |
|
| 69 |
+# If you want feedparser to automatically resolve all relative URIs, set this |
|
| 70 |
+# to 1. |
|
| 71 |
+RESOLVE_RELATIVE_URIS = 1 |
|
| 72 |
+ |
|
| 73 |
+# If you want feedparser to automatically sanitize all potentially unsafe |
|
| 74 |
+# HTML content, set this to 1. |
|
| 75 |
+SANITIZE_HTML = 1 |
|
| 76 |
+ |
|
| 77 |
+# ---------- required modules (should come with any Python distribution) ---------- |
|
| 78 |
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 |
|
| 79 |
+try: |
|
| 80 |
+ from cStringIO import StringIO as _StringIO |
|
| 81 |
+except: |
|
| 82 |
+ from StringIO import StringIO as _StringIO |
|
| 83 |
+ |
|
| 84 |
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- |
|
| 85 |
+ |
|
| 86 |
+# gzip is included with most Python distributions, but may not be available if you compiled your own |
|
| 87 |
+try: |
|
| 88 |
+ import gzip |
|
| 89 |
+except: |
|
| 90 |
+ gzip = None |
|
| 91 |
+try: |
|
| 92 |
+ import zlib |
|
| 93 |
+except: |
|
| 94 |
+ zlib = None |
|
| 95 |
+ |
|
| 96 |
+# If a real XML parser is available, feedparser will attempt to use it. feedparser has |
|
| 97 |
+# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the |
|
| 98 |
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some |
|
| 99 |
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. |
|
| 100 |
+try: |
|
| 101 |
+ import xml.sax |
|
| 102 |
+ xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers |
|
| 103 |
+ from xml.sax.saxutils import escape as _xmlescape |
|
| 104 |
+ _XML_AVAILABLE = 1 |
|
| 105 |
+except: |
|
| 106 |
+ _XML_AVAILABLE = 0 |
|
| 107 |
+ def _xmlescape(data,entities={}):
|
|
| 108 |
+ data = data.replace('&', '&')
|
|
| 109 |
+ data = data.replace('>', '>')
|
|
| 110 |
+ data = data.replace('<', '<')
|
|
| 111 |
+ for char, entity in entities: |
|
| 112 |
+ data = data.replace(char, entity) |
|
| 113 |
+ return data |
|
| 114 |
+ |
|
| 115 |
+# base64 support for Atom feeds that contain embedded binary data |
|
| 116 |
+try: |
|
| 117 |
+ import base64, binascii |
|
| 118 |
+except: |
|
| 119 |
+ base64 = binascii = None |
|
| 120 |
+ |
|
| 121 |
+# cjkcodecs and iconv_codec provide support for more character encodings. |
|
| 122 |
+# Both are available from http://cjkpython.i18n.org/ |
|
| 123 |
+try: |
|
| 124 |
+ import cjkcodecs.aliases |
|
| 125 |
+except: |
|
| 126 |
+ pass |
|
| 127 |
+try: |
|
| 128 |
+ import iconv_codec |
|
| 129 |
+except: |
|
| 130 |
+ pass |
|
| 131 |
+ |
|
| 132 |
+# chardet library auto-detects character encodings |
|
| 133 |
+# Download from http://chardet.feedparser.org/ |
|
| 134 |
+try: |
|
| 135 |
+ import chardet |
|
| 136 |
+ if _debug: |
|
| 137 |
+ import chardet.constants |
|
| 138 |
+ chardet.constants._debug = 1 |
|
| 139 |
+except: |
|
| 140 |
+ chardet = None |
|
| 141 |
+ |
|
| 142 |
+# reversable htmlentitydefs mappings for Python 2.2 |
|
| 143 |
+try: |
|
| 144 |
+ from htmlentitydefs import name2codepoint, codepoint2name |
|
| 145 |
+except: |
|
| 146 |
+ import htmlentitydefs |
|
| 147 |
+ name2codepoint={}
|
|
| 148 |
+ codepoint2name={}
|
|
| 149 |
+ for (name,codepoint) in htmlentitydefs.entitydefs.iteritems(): |
|
| 150 |
+ if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
|
|
| 151 |
+ name2codepoint[name]=ord(codepoint) |
|
| 152 |
+ codepoint2name[ord(codepoint)]=name |
|
| 153 |
+ |
|
| 154 |
+# BeautifulSoup parser used for parsing microformats from embedded HTML content |
|
| 155 |
+# http://www.crummy.com/software/BeautifulSoup/ |
|
| 156 |
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the |
|
| 157 |
+# older 2.x series. If it doesn't, and you can figure out why, I'll accept a |
|
| 158 |
+# patch and modify the compatibility statement accordingly. |
|
| 159 |
+try: |
|
| 160 |
+ import BeautifulSoup |
|
| 161 |
+except: |
|
| 162 |
+ BeautifulSoup = None |
|
| 163 |
+ |
|
| 164 |
+# ---------- don't touch these ---------- |
|
| 165 |
+class ThingsNobodyCaresAboutButMe(Exception): pass |
|
| 166 |
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass |
|
| 167 |
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass |
|
| 168 |
+class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass |
|
| 169 |
+class UndeclaredNamespace(Exception): pass |
|
| 170 |
+ |
|
| 171 |
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
|
| 172 |
+sgmllib.special = re.compile('<!')
|
|
| 173 |
+sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
|
|
| 174 |
+ |
|
| 175 |
+if sgmllib.endbracket.search(' <').start(0):
|
|
| 176 |
+ class EndBracketMatch: |
|
| 177 |
+ endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
|
|
| 178 |
+ def search(self,string,index=0): |
|
| 179 |
+ self.match = self.endbracket.match(string,index) |
|
| 180 |
+ if self.match: return self |
|
| 181 |
+ def start(self,n): |
|
| 182 |
+ return self.match.end(n) |
|
| 183 |
+ sgmllib.endbracket = EndBracketMatch() |
|
| 184 |
+ |
|
| 185 |
+SUPPORTED_VERSIONS = {'': 'unknown',
|
|
| 186 |
+ 'rss090': 'RSS 0.90', |
|
| 187 |
+ 'rss091n': 'RSS 0.91 (Netscape)', |
|
| 188 |
+ 'rss091u': 'RSS 0.91 (Userland)', |
|
| 189 |
+ 'rss092': 'RSS 0.92', |
|
| 190 |
+ 'rss093': 'RSS 0.93', |
|
| 191 |
+ 'rss094': 'RSS 0.94', |
|
| 192 |
+ 'rss20': 'RSS 2.0', |
|
| 193 |
+ 'rss10': 'RSS 1.0', |
|
| 194 |
+ 'rss': 'RSS (unknown version)', |
|
| 195 |
+ 'atom01': 'Atom 0.1', |
|
| 196 |
+ 'atom02': 'Atom 0.2', |
|
| 197 |
+ 'atom03': 'Atom 0.3', |
|
| 198 |
+ 'atom10': 'Atom 1.0', |
|
| 199 |
+ 'atom': 'Atom (unknown version)', |
|
| 200 |
+ 'cdf': 'CDF', |
|
| 201 |
+ 'hotrss': 'Hot RSS' |
|
| 202 |
+ } |
|
| 203 |
+ |
|
| 204 |
+try: |
|
| 205 |
+ UserDict = dict |
|
| 206 |
+except NameError: |
|
| 207 |
+ # Python 2.1 does not have dict |
|
| 208 |
+ from UserDict import UserDict |
|
| 209 |
+ def dict(aList): |
|
| 210 |
+ rc = {}
|
|
| 211 |
+ for k, v in aList: |
|
| 212 |
+ rc[k] = v |
|
| 213 |
+ return rc |
|
| 214 |
+ |
|
| 215 |
+class FeedParserDict(UserDict): |
|
| 216 |
+ keymap = {'channel': 'feed',
|
|
| 217 |
+ 'items': 'entries', |
|
| 218 |
+ 'guid': 'id', |
|
| 219 |
+ 'date': 'updated', |
|
| 220 |
+ 'date_parsed': 'updated_parsed', |
|
| 221 |
+ 'description': ['subtitle', 'summary'], |
|
| 222 |
+ 'url': ['href'], |
|
| 223 |
+ 'modified': 'updated', |
|
| 224 |
+ 'modified_parsed': 'updated_parsed', |
|
| 225 |
+ 'issued': 'published', |
|
| 226 |
+ 'issued_parsed': 'published_parsed', |
|
| 227 |
+ 'copyright': 'rights', |
|
| 228 |
+ 'copyright_detail': 'rights_detail', |
|
| 229 |
+ 'tagline': 'subtitle', |
|
| 230 |
+ 'tagline_detail': 'subtitle_detail'} |
|
| 231 |
+ def __getitem__(self, key): |
|
| 232 |
+ if key == 'category': |
|
| 233 |
+ return UserDict.__getitem__(self, 'tags')[0]['term'] |
|
| 234 |
+ if key == 'enclosures': |
|
| 235 |
+ norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) |
|
| 236 |
+ return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] |
|
| 237 |
+ if key == 'license': |
|
| 238 |
+ for link in UserDict.__getitem__(self, 'links'): |
|
| 239 |
+ if link['rel']=='license' and link.has_key('href'):
|
|
| 240 |
+ return link['href'] |
|
| 241 |
+ if key == 'categories': |
|
| 242 |
+ return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] |
|
| 243 |
+ realkey = self.keymap.get(key, key) |
|
| 244 |
+ if type(realkey) == types.ListType: |
|
| 245 |
+ for k in realkey: |
|
| 246 |
+ if UserDict.has_key(self, k): |
|
| 247 |
+ return UserDict.__getitem__(self, k) |
|
| 248 |
+ if UserDict.has_key(self, key): |
|
| 249 |
+ return UserDict.__getitem__(self, key) |
|
| 250 |
+ return UserDict.__getitem__(self, realkey) |
|
| 251 |
+ |
|
| 252 |
+ def __setitem__(self, key, value): |
|
| 253 |
+ for k in self.keymap.keys(): |
|
| 254 |
+ if key == k: |
|
| 255 |
+ key = self.keymap[k] |
|
| 256 |
+ if type(key) == types.ListType: |
|
| 257 |
+ key = key[0] |
|
| 258 |
+ return UserDict.__setitem__(self, key, value) |
|
| 259 |
+ |
|
| 260 |
+ def get(self, key, default=None): |
|
| 261 |
+ if self.has_key(key): |
|
| 262 |
+ return self[key] |
|
| 263 |
+ else: |
|
| 264 |
+ return default |
|
| 265 |
+ |
|
| 266 |
+ def setdefault(self, key, value): |
|
| 267 |
+ if not self.has_key(key): |
|
| 268 |
+ self[key] = value |
|
| 269 |
+ return self[key] |
|
| 270 |
+ |
|
| 271 |
+ def has_key(self, key): |
|
| 272 |
+ try: |
|
| 273 |
+ return hasattr(self, key) or UserDict.has_key(self, key) |
|
| 274 |
+ except AttributeError: |
|
| 275 |
+ return False |
|
| 276 |
+ |
|
| 277 |
+ def __getattr__(self, key): |
|
| 278 |
+ try: |
|
| 279 |
+ return self.__dict__[key] |
|
| 280 |
+ except KeyError: |
|
| 281 |
+ pass |
|
| 282 |
+ try: |
|
| 283 |
+ assert not key.startswith('_')
|
|
| 284 |
+ return self.__getitem__(key) |
|
| 285 |
+ except: |
|
| 286 |
+ raise AttributeError, "object has no attribute '%s'" % key |
|
| 287 |
+ |
|
| 288 |
+ def __setattr__(self, key, value): |
|
| 289 |
+ if key.startswith('_') or key == 'data':
|
|
| 290 |
+ self.__dict__[key] = value |
|
| 291 |
+ else: |
|
| 292 |
+ return self.__setitem__(key, value) |
|
| 293 |
+ |
|
| 294 |
+ def __contains__(self, key): |
|
| 295 |
+ return self.has_key(key) |
|
| 296 |
+ |
|
| 297 |
+def zopeCompatibilityHack(): |
|
| 298 |
+ global FeedParserDict |
|
| 299 |
+ del FeedParserDict |
|
| 300 |
+ def FeedParserDict(aDict=None): |
|
| 301 |
+ rc = {}
|
|
| 302 |
+ if aDict: |
|
| 303 |
+ rc.update(aDict) |
|
| 304 |
+ return rc |
|
| 305 |
+ |
|
| 306 |
+_ebcdic_to_ascii_map = None |
|
| 307 |
+def _ebcdic_to_ascii(s): |
|
| 308 |
+ global _ebcdic_to_ascii_map |
|
| 309 |
+ if not _ebcdic_to_ascii_map: |
|
| 310 |
+ emap = ( |
|
| 311 |
+ 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, |
|
| 312 |
+ 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, |
|
| 313 |
+ 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, |
|
| 314 |
+ 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, |
|
| 315 |
+ 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, |
|
| 316 |
+ 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, |
|
| 317 |
+ 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, |
|
| 318 |
+ 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, |
|
| 319 |
+ 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201, |
|
| 320 |
+ 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208, |
|
| 321 |
+ 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215, |
|
| 322 |
+ 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231, |
|
| 323 |
+ 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237, |
|
| 324 |
+ 125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243, |
|
| 325 |
+ 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, |
|
| 326 |
+ 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 |
|
| 327 |
+ ) |
|
| 328 |
+ import string |
|
| 329 |
+ _ebcdic_to_ascii_map = string.maketrans( \ |
|
| 330 |
+ ''.join(map(chr, range(256))), ''.join(map(chr, emap))) |
|
| 331 |
+ return s.translate(_ebcdic_to_ascii_map) |
|
| 332 |
+ |
|
| 333 |
+_cp1252 = {
|
|
| 334 |
+ unichr(128): unichr(8364), # euro sign |
|
| 335 |
+ unichr(130): unichr(8218), # single low-9 quotation mark |
|
| 336 |
+ unichr(131): unichr( 402), # latin small letter f with hook |
|
| 337 |
+ unichr(132): unichr(8222), # double low-9 quotation mark |
|
| 338 |
+ unichr(133): unichr(8230), # horizontal ellipsis |
|
| 339 |
+ unichr(134): unichr(8224), # dagger |
|
| 340 |
+ unichr(135): unichr(8225), # double dagger |
|
| 341 |
+ unichr(136): unichr( 710), # modifier letter circumflex accent |
|
| 342 |
+ unichr(137): unichr(8240), # per mille sign |
|
| 343 |
+ unichr(138): unichr( 352), # latin capital letter s with caron |
|
| 344 |
+ unichr(139): unichr(8249), # single left-pointing angle quotation mark |
|
| 345 |
+ unichr(140): unichr( 338), # latin capital ligature oe |
|
| 346 |
+ unichr(142): unichr( 381), # latin capital letter z with caron |
|
| 347 |
+ unichr(145): unichr(8216), # left single quotation mark |
|
| 348 |
+ unichr(146): unichr(8217), # right single quotation mark |
|
| 349 |
+ unichr(147): unichr(8220), # left double quotation mark |
|
| 350 |
+ unichr(148): unichr(8221), # right double quotation mark |
|
| 351 |
+ unichr(149): unichr(8226), # bullet |
|
| 352 |
+ unichr(150): unichr(8211), # en dash |
|
| 353 |
+ unichr(151): unichr(8212), # em dash |
|
| 354 |
+ unichr(152): unichr( 732), # small tilde |
|
| 355 |
+ unichr(153): unichr(8482), # trade mark sign |
|
| 356 |
+ unichr(154): unichr( 353), # latin small letter s with caron |
|
| 357 |
+ unichr(155): unichr(8250), # single right-pointing angle quotation mark |
|
| 358 |
+ unichr(156): unichr( 339), # latin small ligature oe |
|
| 359 |
+ unichr(158): unichr( 382), # latin small letter z with caron |
|
| 360 |
+ unichr(159): unichr( 376)} # latin capital letter y with diaeresis |
|
| 361 |
+ |
|
| 362 |
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
|
|
| 363 |
+def _urljoin(base, uri): |
|
| 364 |
+ uri = _urifixer.sub(r'\1\3', uri) |
|
| 365 |
+ try: |
|
| 366 |
+ return urlparse.urljoin(base, uri) |
|
| 367 |
+ except: |
|
| 368 |
+ uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)]) |
|
| 369 |
+ return urlparse.urljoin(base, uri) |
|
| 370 |
+ |
|
| 371 |
+class _FeedParserMixin: |
|
| 372 |
+ namespaces = {'': '',
|
|
| 373 |
+ 'http://backend.userland.com/rss': '', |
|
| 374 |
+ 'http://blogs.law.harvard.edu/tech/rss': '', |
|
| 375 |
+ 'http://purl.org/rss/1.0/': '', |
|
| 376 |
+ 'http://my.netscape.com/rdf/simple/0.9/': '', |
|
| 377 |
+ 'http://example.com/newformat#': '', |
|
| 378 |
+ 'http://example.com/necho': '', |
|
| 379 |
+ 'http://purl.org/echo/': '', |
|
| 380 |
+ 'uri/of/echo/namespace#': '', |
|
| 381 |
+ 'http://purl.org/pie/': '', |
|
| 382 |
+ 'http://purl.org/atom/ns#': '', |
|
| 383 |
+ 'http://www.w3.org/2005/Atom': '', |
|
| 384 |
+ 'http://purl.org/rss/1.0/modules/rss091#': '', |
|
| 385 |
+ |
|
| 386 |
+ 'http://webns.net/mvcb/': 'admin', |
|
| 387 |
+ 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', |
|
| 388 |
+ 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', |
|
| 389 |
+ 'http://media.tangent.org/rss/1.0/': 'audio', |
|
| 390 |
+ 'http://backend.userland.com/blogChannelModule': 'blogChannel', |
|
| 391 |
+ 'http://web.resource.org/cc/': 'cc', |
|
| 392 |
+ 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', |
|
| 393 |
+ 'http://purl.org/rss/1.0/modules/company': 'co', |
|
| 394 |
+ 'http://purl.org/rss/1.0/modules/content/': 'content', |
|
| 395 |
+ 'http://my.theinfo.org/changed/1.0/rss/': 'cp', |
|
| 396 |
+ 'http://purl.org/dc/elements/1.1/': 'dc', |
|
| 397 |
+ 'http://purl.org/dc/terms/': 'dcterms', |
|
| 398 |
+ 'http://purl.org/rss/1.0/modules/email/': 'email', |
|
| 399 |
+ 'http://purl.org/rss/1.0/modules/event/': 'ev', |
|
| 400 |
+ 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', |
|
| 401 |
+ 'http://freshmeat.net/rss/fm/': 'fm', |
|
| 402 |
+ 'http://xmlns.com/foaf/0.1/': 'foaf', |
|
| 403 |
+ 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', |
|
| 404 |
+ 'http://postneo.com/icbm/': 'icbm', |
|
| 405 |
+ 'http://purl.org/rss/1.0/modules/image/': 'image', |
|
| 406 |
+ 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', |
|
| 407 |
+ 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', |
|
| 408 |
+ 'http://purl.org/rss/1.0/modules/link/': 'l', |
|
| 409 |
+ 'http://search.yahoo.com/mrss': 'media', |
|
| 410 |
+ 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', |
|
| 411 |
+ 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', |
|
| 412 |
+ 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', |
|
| 413 |
+ 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', |
|
| 414 |
+ 'http://purl.org/rss/1.0/modules/reference/': 'ref', |
|
| 415 |
+ 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', |
|
| 416 |
+ 'http://purl.org/rss/1.0/modules/search/': 'search', |
|
| 417 |
+ 'http://purl.org/rss/1.0/modules/slash/': 'slash', |
|
| 418 |
+ 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', |
|
| 419 |
+ 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', |
|
| 420 |
+ 'http://hacks.benhammersley.com/rss/streaming/': 'str', |
|
| 421 |
+ 'http://purl.org/rss/1.0/modules/subscription/': 'sub', |
|
| 422 |
+ 'http://purl.org/rss/1.0/modules/syndication/': 'sy', |
|
| 423 |
+ 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', |
|
| 424 |
+ 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', |
|
| 425 |
+ 'http://purl.org/rss/1.0/modules/threading/': 'thr', |
|
| 426 |
+ 'http://purl.org/rss/1.0/modules/textinput/': 'ti', |
|
| 427 |
+ 'http://madskills.com/public/xml/rss/module/trackback/':'trackback', |
|
| 428 |
+ 'http://wellformedweb.org/commentAPI/': 'wfw', |
|
| 429 |
+ 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', |
|
| 430 |
+ 'http://www.w3.org/1999/xhtml': 'xhtml', |
|
| 431 |
+ 'http://www.w3.org/1999/xlink': 'xlink', |
|
| 432 |
+ 'http://www.w3.org/XML/1998/namespace': 'xml' |
|
| 433 |
+} |
|
| 434 |
+ _matchnamespaces = {}
|
|
| 435 |
+ |
|
| 436 |
+ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'] |
|
| 437 |
+ can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] |
|
| 438 |
+ can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] |
|
| 439 |
+ html_types = ['text/html', 'application/xhtml+xml'] |
|
| 440 |
+ |
|
| 441 |
+ def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): |
|
| 442 |
+ if _debug: sys.stderr.write('initializing FeedParser\n')
|
|
| 443 |
+ if not self._matchnamespaces: |
|
| 444 |
+ for k, v in self.namespaces.items(): |
|
| 445 |
+ self._matchnamespaces[k.lower()] = v |
|
| 446 |
+ self.feeddata = FeedParserDict() # feed-level data |
|
| 447 |
+ self.encoding = encoding # character encoding |
|
| 448 |
+ self.entries = [] # list of entry-level data |
|
| 449 |
+ self.version = '' # feed type/version, see SUPPORTED_VERSIONS |
|
| 450 |
+ self.namespacesInUse = {} # dictionary of namespaces defined by the feed
|
|
| 451 |
+ |
|
| 452 |
+ # the following are used internally to track state; |
|
| 453 |
+ # this is really out of control and should be refactored |
|
| 454 |
+ self.infeed = 0 |
|
| 455 |
+ self.inentry = 0 |
|
| 456 |
+ self.incontent = 0 |
|
| 457 |
+ self.intextinput = 0 |
|
| 458 |
+ self.inimage = 0 |
|
| 459 |
+ self.inauthor = 0 |
|
| 460 |
+ self.incontributor = 0 |
|
| 461 |
+ self.inpublisher = 0 |
|
| 462 |
+ self.insource = 0 |
|
| 463 |
+ self.sourcedata = FeedParserDict() |
|
| 464 |
+ self.contentparams = FeedParserDict() |
|
| 465 |
+ self._summaryKey = None |
|
| 466 |
+ self.namespacemap = {}
|
|
| 467 |
+ self.elementstack = [] |
|
| 468 |
+ self.basestack = [] |
|
| 469 |
+ self.langstack = [] |
|
| 470 |
+ self.baseuri = baseuri or '' |
|
| 471 |
+ self.lang = baselang or None |
|
| 472 |
+ self.svgOK = 0 |
|
| 473 |
+ self.hasTitle = 0 |
|
| 474 |
+ if baselang: |
|
| 475 |
+ self.feeddata['language'] = baselang.replace('_','-')
|
|
| 476 |
+ |
|
| 477 |
+ def unknown_starttag(self, tag, attrs): |
|
| 478 |
+ if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
|
|
| 479 |
+ # normalize attrs |
|
| 480 |
+ attrs = [(k.lower(), v) for k, v in attrs] |
|
| 481 |
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
|
|
| 482 |
+ |
|
| 483 |
+ # track xml:base and xml:lang |
|
| 484 |
+ attrsD = dict(attrs) |
|
| 485 |
+ baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
|
|
| 486 |
+ if type(baseuri) != type(u''): |
|
| 487 |
+ try: |
|
| 488 |
+ baseuri = unicode(baseuri, self.encoding) |
|
| 489 |
+ except: |
|
| 490 |
+ baseuri = unicode(baseuri, 'iso-8859-1') |
|
| 491 |
+ self.baseuri = _urljoin(self.baseuri, baseuri) |
|
| 492 |
+ lang = attrsD.get('xml:lang', attrsD.get('lang'))
|
|
| 493 |
+ if lang == '': |
|
| 494 |
+ # xml:lang could be explicitly set to '', we need to capture that |
|
| 495 |
+ lang = None |
|
| 496 |
+ elif lang is None: |
|
| 497 |
+ # if no xml:lang is specified, use parent lang |
|
| 498 |
+ lang = self.lang |
|
| 499 |
+ if lang: |
|
| 500 |
+ if tag in ('feed', 'rss', 'rdf:RDF'):
|
|
| 501 |
+ self.feeddata['language'] = lang.replace('_','-')
|
|
| 502 |
+ self.lang = lang |
|
| 503 |
+ self.basestack.append(self.baseuri) |
|
| 504 |
+ self.langstack.append(lang) |
|
| 505 |
+ |
|
| 506 |
+ # track namespaces |
|
| 507 |
+ for prefix, uri in attrs: |
|
| 508 |
+ if prefix.startswith('xmlns:'):
|
|
| 509 |
+ self.trackNamespace(prefix[6:], uri) |
|
| 510 |
+ elif prefix == 'xmlns': |
|
| 511 |
+ self.trackNamespace(None, uri) |
|
| 512 |
+ |
|
| 513 |
+ # track inline content |
|
| 514 |
+ if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
|
|
| 515 |
+ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 |
|
| 516 |
+ # element declared itself as escaped markup, but it isn't really |
|
| 517 |
+ self.contentparams['type'] = 'application/xhtml+xml' |
|
| 518 |
+ if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
|
|
| 519 |
+ if tag.find(':') <> -1:
|
|
| 520 |
+ prefix, tag = tag.split(':', 1)
|
|
| 521 |
+ namespace = self.namespacesInUse.get(prefix, '') |
|
| 522 |
+ if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML': |
|
| 523 |
+ attrs.append(('xmlns',namespace))
|
|
| 524 |
+ if tag=='svg' and namespace=='http://www.w3.org/2000/svg': |
|
| 525 |
+ attrs.append(('xmlns',namespace))
|
|
| 526 |
+ if tag == 'svg': self.svgOK += 1 |
|
| 527 |
+ return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
|
|
| 528 |
+ |
|
| 529 |
+ # match namespaces |
|
| 530 |
+ if tag.find(':') <> -1:
|
|
| 531 |
+ prefix, suffix = tag.split(':', 1)
|
|
| 532 |
+ else: |
|
| 533 |
+ prefix, suffix = '', tag |
|
| 534 |
+ prefix = self.namespacemap.get(prefix, prefix) |
|
| 535 |
+ if prefix: |
|
| 536 |
+ prefix = prefix + '_' |
|
| 537 |
+ |
|
| 538 |
+ # special hack for better tracking of empty textinput/image elements in illformed feeds |
|
| 539 |
+ if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
|
|
| 540 |
+ self.intextinput = 0 |
|
| 541 |
+ if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
|
|
| 542 |
+ self.inimage = 0 |
|
| 543 |
+ |
|
| 544 |
+ # call special handler (if defined) or default handler |
|
| 545 |
+ methodname = '_start_' + prefix + suffix |
|
| 546 |
+ try: |
|
| 547 |
+ method = getattr(self, methodname) |
|
| 548 |
+ return method(attrsD) |
|
| 549 |
+ except AttributeError: |
|
| 550 |
+ return self.push(prefix + suffix, 1) |
|
| 551 |
+ |
|
| 552 |
+ def unknown_endtag(self, tag): |
|
| 553 |
+ if _debug: sys.stderr.write('end %s\n' % tag)
|
|
| 554 |
+ # match namespaces |
|
| 555 |
+ if tag.find(':') <> -1:
|
|
| 556 |
+ prefix, suffix = tag.split(':', 1)
|
|
| 557 |
+ else: |
|
| 558 |
+ prefix, suffix = '', tag |
|
| 559 |
+ prefix = self.namespacemap.get(prefix, prefix) |
|
| 560 |
+ if prefix: |
|
| 561 |
+ prefix = prefix + '_' |
|
| 562 |
+ if suffix == 'svg' and self.svgOK: self.svgOK -= 1 |
|
| 563 |
+ |
|
| 564 |
+ # call special handler (if defined) or default handler |
|
| 565 |
+ methodname = '_end_' + prefix + suffix |
|
| 566 |
+ try: |
|
| 567 |
+ if self.svgOK: raise AttributeError() |
|
| 568 |
+ method = getattr(self, methodname) |
|
| 569 |
+ method() |
|
| 570 |
+ except AttributeError: |
|
| 571 |
+ self.pop(prefix + suffix) |
|
| 572 |
+ |
|
| 573 |
+ # track inline content |
|
| 574 |
+ if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
|
|
| 575 |
+ # element declared itself as escaped markup, but it isn't really |
|
| 576 |
+ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 |
|
| 577 |
+ self.contentparams['type'] = 'application/xhtml+xml' |
|
| 578 |
+ if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
|
|
| 579 |
+ tag = tag.split(':')[-1]
|
|
| 580 |
+ self.handle_data('</%s>' % tag, escape=0)
|
|
| 581 |
+ |
|
| 582 |
+ # track xml:base and xml:lang going out of scope |
|
| 583 |
+ if self.basestack: |
|
| 584 |
+ self.basestack.pop() |
|
| 585 |
+ if self.basestack and self.basestack[-1]: |
|
| 586 |
+ self.baseuri = self.basestack[-1] |
|
| 587 |
+ if self.langstack: |
|
| 588 |
+ self.langstack.pop() |
|
| 589 |
+ if self.langstack: # and (self.langstack[-1] is not None): |
|
| 590 |
+ self.lang = self.langstack[-1] |
|
| 591 |
+ |
|
| 592 |
+ def handle_charref(self, ref): |
|
| 593 |
+ # called for each character reference, e.g. for ' ', ref will be '160' |
|
| 594 |
+ if not self.elementstack: return |
|
| 595 |
+ ref = ref.lower() |
|
| 596 |
+ if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
|
|
| 597 |
+ text = '&#%s;' % ref |
|
| 598 |
+ else: |
|
| 599 |
+ if ref[0] == 'x': |
|
| 600 |
+ c = int(ref[1:], 16) |
|
| 601 |
+ else: |
|
| 602 |
+ c = int(ref) |
|
| 603 |
+ text = unichr(c).encode('utf-8')
|
|
| 604 |
+ self.elementstack[-1][2].append(text) |
|
| 605 |
+ |
|
| 606 |
+ def handle_entityref(self, ref): |
|
| 607 |
+ # called for each entity reference, e.g. for '©', ref will be 'copy' |
|
| 608 |
+ if not self.elementstack: return |
|
| 609 |
+ if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
|
|
| 610 |
+ if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
|
|
| 611 |
+ text = '&%s;' % ref |
|
| 612 |
+ elif ref in self.entities.keys(): |
|
| 613 |
+ text = self.entities[ref] |
|
| 614 |
+ if text.startswith('&#') and text.endswith(';'):
|
|
| 615 |
+ return self.handle_entityref(text) |
|
| 616 |
+ else: |
|
| 617 |
+ try: name2codepoint[ref] |
|
| 618 |
+ except KeyError: text = '&%s;' % ref |
|
| 619 |
+ else: text = unichr(name2codepoint[ref]).encode('utf-8')
|
|
| 620 |
+ self.elementstack[-1][2].append(text) |
|
| 621 |
+ |
|
| 622 |
+ def handle_data(self, text, escape=1): |
|
| 623 |
+ # called for each block of plain text, i.e. outside of any tag and |
|
| 624 |
+ # not containing any character or entity references |
|
| 625 |
+ if not self.elementstack: return |
|
| 626 |
+ if escape and self.contentparams.get('type') == 'application/xhtml+xml':
|
|
| 627 |
+ text = _xmlescape(text) |
|
| 628 |
+ self.elementstack[-1][2].append(text) |
|
| 629 |
+ |
|
| 630 |
+ def handle_comment(self, text): |
|
| 631 |
+ # called for each comment, e.g. <!-- insert message here --> |
|
| 632 |
+ pass |
|
| 633 |
+ |
|
| 634 |
+ def handle_pi(self, text): |
|
| 635 |
+ # called for each processing instruction, e.g. <?instruction> |
|
| 636 |
+ pass |
|
| 637 |
+ |
|
| 638 |
+ def handle_decl(self, text): |
|
| 639 |
+ pass |
|
| 640 |
+ |
|
| 641 |
+ def parse_declaration(self, i): |
|
| 642 |
+ # override internal declaration handler to handle CDATA blocks |
|
| 643 |
+ if _debug: sys.stderr.write('entering parse_declaration\n')
|
|
| 644 |
+ if self.rawdata[i:i+9] == '<![CDATA[': |
|
| 645 |
+ k = self.rawdata.find(']]>', i)
|
|
| 646 |
+ if k == -1: k = len(self.rawdata) |
|
| 647 |
+ self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) |
|
| 648 |
+ return k+3 |
|
| 649 |
+ else: |
|
| 650 |
+ k = self.rawdata.find('>', i)
|
|
| 651 |
+ return k+1 |
|
| 652 |
+ |
|
| 653 |
+ def mapContentType(self, contentType): |
|
| 654 |
+ contentType = contentType.lower() |
|
| 655 |
+ if contentType == 'text': |
|
| 656 |
+ contentType = 'text/plain' |
|
| 657 |
+ elif contentType == 'html': |
|
| 658 |
+ contentType = 'text/html' |
|
| 659 |
+ elif contentType == 'xhtml': |
|
| 660 |
+ contentType = 'application/xhtml+xml' |
|
| 661 |
+ return contentType |
|
| 662 |
+ |
|
| 663 |
+ def trackNamespace(self, prefix, uri): |
|
| 664 |
+ loweruri = uri.lower() |
|
| 665 |
+ if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: |
|
| 666 |
+ self.version = 'rss090' |
|
| 667 |
+ if loweruri == 'http://purl.org/rss/1.0/' and not self.version: |
|
| 668 |
+ self.version = 'rss10' |
|
| 669 |
+ if loweruri == 'http://www.w3.org/2005/atom' and not self.version: |
|
| 670 |
+ self.version = 'atom10' |
|
| 671 |
+ if loweruri.find('backend.userland.com/rss') <> -1:
|
|
| 672 |
+ # match any backend.userland.com namespace |
|
| 673 |
+ uri = 'http://backend.userland.com/rss' |
|
| 674 |
+ loweruri = uri |
|
| 675 |
+ if self._matchnamespaces.has_key(loweruri): |
|
| 676 |
+ self.namespacemap[prefix] = self._matchnamespaces[loweruri] |
|
| 677 |
+ self.namespacesInUse[self._matchnamespaces[loweruri]] = uri |
|
| 678 |
+ else: |
|
| 679 |
+ self.namespacesInUse[prefix or ''] = uri |
|
| 680 |
+ |
|
| 681 |
+ def resolveURI(self, uri): |
|
| 682 |
+ return _urljoin(self.baseuri or '', uri) |
|
| 683 |
+ |
|
| 684 |
+ def decodeEntities(self, element, data): |
|
| 685 |
+ return data |
|
| 686 |
+ |
|
| 687 |
+ def strattrs(self, attrs): |
|
| 688 |
+ return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs])
|
|
| 689 |
+ |
|
| 690 |
+ def push(self, element, expectingText): |
|
| 691 |
+ self.elementstack.append([element, expectingText, []]) |
|
| 692 |
+ |
|
| 693 |
+ def pop(self, element, stripWhitespace=1): |
|
| 694 |
+ if not self.elementstack: return |
|
| 695 |
+ if self.elementstack[-1][0] != element: return |
|
| 696 |
+ |
|
| 697 |
+ element, expectingText, pieces = self.elementstack.pop() |
|
| 698 |
+ |
|
| 699 |
+ if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
|
|
| 700 |
+ # remove enclosing child element, but only if it is a <div> and |
|
| 701 |
+ # only if all the remaining content is nested underneath it. |
|
| 702 |
+ # This means that the divs would be retained in the following: |
|
| 703 |
+ # <div>foo</div><div>bar</div> |
|
| 704 |
+ while pieces and len(pieces)>1 and not pieces[-1].strip(): |
|
| 705 |
+ del pieces[-1] |
|
| 706 |
+ while pieces and len(pieces)>1 and not pieces[0].strip(): |
|
| 707 |
+ del pieces[0] |
|
| 708 |
+ if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
|
|
| 709 |
+ depth = 0 |
|
| 710 |
+ for piece in pieces[:-1]: |
|
| 711 |
+ if piece.startswith('</'):
|
|
| 712 |
+ depth -= 1 |
|
| 713 |
+ if depth == 0: break |
|
| 714 |
+ elif piece.startswith('<') and not piece.endswith('/>'):
|
|
| 715 |
+ depth += 1 |
|
| 716 |
+ else: |
|
| 717 |
+ pieces = pieces[1:-1] |
|
| 718 |
+ |
|
| 719 |
+ output = ''.join(pieces) |
|
| 720 |
+ if stripWhitespace: |
|
| 721 |
+ output = output.strip() |
|
| 722 |
+ if not expectingText: return output |
|
| 723 |
+ |
|
| 724 |
+ # decode base64 content |
|
| 725 |
+ if base64 and self.contentparams.get('base64', 0):
|
|
| 726 |
+ try: |
|
| 727 |
+ output = base64.decodestring(output) |
|
| 728 |
+ except binascii.Error: |
|
| 729 |
+ pass |
|
| 730 |
+ except binascii.Incomplete: |
|
| 731 |
+ pass |
|
| 732 |
+ |
|
| 733 |
+ # resolve relative URIs |
|
| 734 |
+ if (element in self.can_be_relative_uri) and output: |
|
| 735 |
+ output = self.resolveURI(output) |
|
| 736 |
+ |
|
| 737 |
+ # decode entities within embedded markup |
|
| 738 |
+ if not self.contentparams.get('base64', 0):
|
|
| 739 |
+ output = self.decodeEntities(element, output) |
|
| 740 |
+ |
|
| 741 |
+ if self.lookslikehtml(output): |
|
| 742 |
+ self.contentparams['type']='text/html' |
|
| 743 |
+ |
|
| 744 |
+ # remove temporary cruft from contentparams |
|
| 745 |
+ try: |
|
| 746 |
+ del self.contentparams['mode'] |
|
| 747 |
+ except KeyError: |
|
| 748 |
+ pass |
|
| 749 |
+ try: |
|
| 750 |
+ del self.contentparams['base64'] |
|
| 751 |
+ except KeyError: |
|
| 752 |
+ pass |
|
| 753 |
+ |
|
| 754 |
+ is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
|
|
| 755 |
+ # resolve relative URIs within embedded markup |
|
| 756 |
+ if is_htmlish and RESOLVE_RELATIVE_URIS: |
|
| 757 |
+ if element in self.can_contain_relative_uris: |
|
| 758 |
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
|
|
| 759 |
+ |
|
| 760 |
+ # parse microformats |
|
| 761 |
+ # (must do this before sanitizing because some microformats |
|
| 762 |
+ # rely on elements that we sanitize) |
|
| 763 |
+ if is_htmlish and element in ['content', 'description', 'summary']: |
|
| 764 |
+ mfresults = _parseMicroformats(output, self.baseuri, self.encoding) |
|
| 765 |
+ if mfresults: |
|
| 766 |
+ for tag in mfresults.get('tags', []):
|
|
| 767 |
+ self._addTag(tag['term'], tag['scheme'], tag['label']) |
|
| 768 |
+ for enclosure in mfresults.get('enclosures', []):
|
|
| 769 |
+ self._start_enclosure(enclosure) |
|
| 770 |
+ for xfn in mfresults.get('xfn', []):
|
|
| 771 |
+ self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) |
|
| 772 |
+ vcard = mfresults.get('vcard')
|
|
| 773 |
+ if vcard: |
|
| 774 |
+ self._getContext()['vcard'] = vcard |
|
| 775 |
+ |
|
| 776 |
+ # sanitize embedded markup |
|
| 777 |
+ if is_htmlish and SANITIZE_HTML: |
|
| 778 |
+ if element in self.can_contain_dangerous_markup: |
|
| 779 |
+ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
|
| 780 |
+ |
|
| 781 |
+ if self.encoding and type(output) != type(u''): |
|
| 782 |
+ try: |
|
| 783 |
+ output = unicode(output, self.encoding) |
|
| 784 |
+ except: |
|
| 785 |
+ pass |
|
| 786 |
+ |
|
| 787 |
+ # address common error where people take data that is already |
|
| 788 |
+ # utf-8, presume that it is iso-8859-1, and re-encode it. |
|
| 789 |
+ if self.encoding=='utf-8' and type(output) == type(u''): |
|
| 790 |
+ try: |
|
| 791 |
+ output = unicode(output.encode('iso-8859-1'), 'utf-8')
|
|
| 792 |
+ except: |
|
| 793 |
+ pass |
|
| 794 |
+ |
|
| 795 |
+ # map win-1252 extensions to the proper code points |
|
| 796 |
+ if type(output) == type(u''): |
|
| 797 |
+ output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) |
|
| 798 |
+ |
|
| 799 |
+ # categories/tags/keywords/whatever are handled in _end_category |
|
| 800 |
+ if element == 'category': |
|
| 801 |
+ return output |
|
| 802 |
+ |
|
| 803 |
+ if element == 'title' and self.hasTitle: |
|
| 804 |
+ return output |
|
| 805 |
+ |
|
| 806 |
+ # store output in appropriate place(s) |
|
| 807 |
+ if self.inentry and not self.insource: |
|
| 808 |
+ if element == 'content': |
|
| 809 |
+ self.entries[-1].setdefault(element, []) |
|
| 810 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
| 811 |
+ contentparams['value'] = output |
|
| 812 |
+ self.entries[-1][element].append(contentparams) |
|
| 813 |
+ elif element == 'link': |
|
| 814 |
+ self.entries[-1][element] = output |
|
| 815 |
+ if output: |
|
| 816 |
+ self.entries[-1]['links'][-1]['href'] = output |
|
| 817 |
+ else: |
|
| 818 |
+ if element == 'description': |
|
| 819 |
+ element = 'summary' |
|
| 820 |
+ self.entries[-1][element] = output |
|
| 821 |
+ if self.incontent: |
|
| 822 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
| 823 |
+ contentparams['value'] = output |
|
| 824 |
+ self.entries[-1][element + '_detail'] = contentparams |
|
| 825 |
+ elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage): |
|
| 826 |
+ context = self._getContext() |
|
| 827 |
+ if element == 'description': |
|
| 828 |
+ element = 'subtitle' |
|
| 829 |
+ context[element] = output |
|
| 830 |
+ if element == 'link': |
|
| 831 |
+ context['links'][-1]['href'] = output |
|
| 832 |
+ elif self.incontent: |
|
| 833 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
| 834 |
+ contentparams['value'] = output |
|
| 835 |
+ context[element + '_detail'] = contentparams |
|
| 836 |
+ return output |
|
| 837 |
+ |
|
| 838 |
+ def pushContent(self, tag, attrsD, defaultContentType, expectingText): |
|
| 839 |
+ self.incontent += 1 |
|
| 840 |
+ if self.lang: self.lang=self.lang.replace('_','-')
|
|
| 841 |
+ self.contentparams = FeedParserDict({
|
|
| 842 |
+ 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
|
|
| 843 |
+ 'language': self.lang, |
|
| 844 |
+ 'base': self.baseuri}) |
|
| 845 |
+ self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) |
|
| 846 |
+ self.push(tag, expectingText) |
|
| 847 |
+ |
|
| 848 |
+ def popContent(self, tag): |
|
| 849 |
+ value = self.pop(tag) |
|
| 850 |
+ self.incontent -= 1 |
|
| 851 |
+ self.contentparams.clear() |
|
| 852 |
+ return value |
|
| 853 |
+ |
|
| 854 |
+ # a number of elements in a number of RSS variants are nominally plain |
|
| 855 |
+ # text, but this is routinely ignored. This is an attempt to detect |
|
| 856 |
+ # the most common cases. As false positives often result in silent |
|
| 857 |
+ # data loss, this function errs on the conservative side. |
|
| 858 |
+ def lookslikehtml(self, str): |
|
| 859 |
+ if self.version.startswith('atom'): return
|
|
| 860 |
+ if self.contentparams.get('type','text/html') != 'text/plain': return
|
|
| 861 |
+ |
|
| 862 |
+ # must have a close tag or a entity reference to qualify |
|
| 863 |
+ if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return
|
|
| 864 |
+ |
|
| 865 |
+ # all tags must be in a restricted subset of valid HTML tags |
|
| 866 |
+ if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, |
|
| 867 |
+ re.findall(r'</?(\w+)',str)): return |
|
| 868 |
+ |
|
| 869 |
+ # all entities must have been defined as valid HTML entities |
|
| 870 |
+ from htmlentitydefs import entitydefs |
|
| 871 |
+ if filter(lambda e: e not in entitydefs.keys(), |
|
| 872 |
+ re.findall(r'&(\w+);',str)): return |
|
| 873 |
+ |
|
| 874 |
+ return 1 |
|
| 875 |
+ |
|
| 876 |
+ def _mapToStandardPrefix(self, name): |
|
| 877 |
+ colonpos = name.find(':')
|
|
| 878 |
+ if colonpos <> -1: |
|
| 879 |
+ prefix = name[:colonpos] |
|
| 880 |
+ suffix = name[colonpos+1:] |
|
| 881 |
+ prefix = self.namespacemap.get(prefix, prefix) |
|
| 882 |
+ name = prefix + ':' + suffix |
|
| 883 |
+ return name |
|
| 884 |
+ |
|
| 885 |
+ def _getAttribute(self, attrsD, name): |
|
| 886 |
+ return attrsD.get(self._mapToStandardPrefix(name)) |
|
| 887 |
+ |
|
| 888 |
+ def _isBase64(self, attrsD, contentparams): |
|
| 889 |
+ if attrsD.get('mode', '') == 'base64':
|
|
| 890 |
+ return 1 |
|
| 891 |
+ if self.contentparams['type'].startswith('text/'):
|
|
| 892 |
+ return 0 |
|
| 893 |
+ if self.contentparams['type'].endswith('+xml'):
|
|
| 894 |
+ return 0 |
|
| 895 |
+ if self.contentparams['type'].endswith('/xml'):
|
|
| 896 |
+ return 0 |
|
| 897 |
+ return 1 |
|
| 898 |
+ |
|
| 899 |
+ def _itsAnHrefDamnIt(self, attrsD): |
|
| 900 |
+ href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
|
|
| 901 |
+ if href: |
|
| 902 |
+ try: |
|
| 903 |
+ del attrsD['url'] |
|
| 904 |
+ except KeyError: |
|
| 905 |
+ pass |
|
| 906 |
+ try: |
|
| 907 |
+ del attrsD['uri'] |
|
| 908 |
+ except KeyError: |
|
| 909 |
+ pass |
|
| 910 |
+ attrsD['href'] = href |
|
| 911 |
+ return attrsD |
|
| 912 |
+ |
|
| 913 |
+ def _save(self, key, value): |
|
| 914 |
+ context = self._getContext() |
|
| 915 |
+ context.setdefault(key, value) |
|
| 916 |
+ |
|
| 917 |
+ def _start_rss(self, attrsD): |
|
| 918 |
+ versionmap = {'0.91': 'rss091u',
|
|
| 919 |
+ '0.92': 'rss092', |
|
| 920 |
+ '0.93': 'rss093', |
|
| 921 |
+ '0.94': 'rss094'} |
|
| 922 |
+ if not self.version: |
|
| 923 |
+ attr_version = attrsD.get('version', '')
|
|
| 924 |
+ version = versionmap.get(attr_version) |
|
| 925 |
+ if version: |
|
| 926 |
+ self.version = version |
|
| 927 |
+ elif attr_version.startswith('2.'):
|
|
| 928 |
+ self.version = 'rss20' |
|
| 929 |
+ else: |
|
| 930 |
+ self.version = 'rss' |
|
| 931 |
+ |
|
| 932 |
+ def _start_dlhottitles(self, attrsD): |
|
| 933 |
+ self.version = 'hotrss' |
|
| 934 |
+ |
|
| 935 |
+ def _start_channel(self, attrsD): |
|
| 936 |
+ self.infeed = 1 |
|
| 937 |
+ self._cdf_common(attrsD) |
|
| 938 |
+ _start_feedinfo = _start_channel |
|
| 939 |
+ |
|
| 940 |
+ def _cdf_common(self, attrsD): |
|
| 941 |
+ if attrsD.has_key('lastmod'):
|
|
| 942 |
+ self._start_modified({})
|
|
| 943 |
+ self.elementstack[-1][-1] = attrsD['lastmod'] |
|
| 944 |
+ self._end_modified() |
|
| 945 |
+ if attrsD.has_key('href'):
|
|
| 946 |
+ self._start_link({})
|
|
| 947 |
+ self.elementstack[-1][-1] = attrsD['href'] |
|
| 948 |
+ self._end_link() |
|
| 949 |
+ |
|
| 950 |
+ def _start_feed(self, attrsD): |
|
| 951 |
+ self.infeed = 1 |
|
| 952 |
+ versionmap = {'0.1': 'atom01',
|
|
| 953 |
+ '0.2': 'atom02', |
|
| 954 |
+ '0.3': 'atom03'} |
|
| 955 |
+ if not self.version: |
|
| 956 |
+ attr_version = attrsD.get('version')
|
|
| 957 |
+ version = versionmap.get(attr_version) |
|
| 958 |
+ if version: |
|
| 959 |
+ self.version = version |
|
| 960 |
+ else: |
|
| 961 |
+ self.version = 'atom' |
|
| 962 |
+ |
|
| 963 |
+ def _end_channel(self): |
|
| 964 |
+ self.infeed = 0 |
|
| 965 |
+ _end_feed = _end_channel |
|
| 966 |
+ |
|
| 967 |
+ def _start_image(self, attrsD): |
|
| 968 |
+ context = self._getContext() |
|
| 969 |
+ context.setdefault('image', FeedParserDict())
|
|
| 970 |
+ self.inimage = 1 |
|
| 971 |
+ self.hasTitle = 0 |
|
| 972 |
+ self.push('image', 0)
|
|
| 973 |
+ |
|
| 974 |
+ def _end_image(self): |
|
| 975 |
+ self.pop('image')
|
|
| 976 |
+ self.inimage = 0 |
|
| 977 |
+ |
|
| 978 |
+ def _start_textinput(self, attrsD): |
|
| 979 |
+ context = self._getContext() |
|
| 980 |
+ context.setdefault('textinput', FeedParserDict())
|
|
| 981 |
+ self.intextinput = 1 |
|
| 982 |
+ self.hasTitle = 0 |
|
| 983 |
+ self.push('textinput', 0)
|
|
| 984 |
+ _start_textInput = _start_textinput |
|
| 985 |
+ |
|
| 986 |
+ def _end_textinput(self): |
|
| 987 |
+ self.pop('textinput')
|
|
| 988 |
+ self.intextinput = 0 |
|
| 989 |
+ _end_textInput = _end_textinput |
|
| 990 |
+ |
|
| 991 |
+ def _start_author(self, attrsD): |
|
| 992 |
+ self.inauthor = 1 |
|
| 993 |
+ self.push('author', 1)
|
|
| 994 |
+ _start_managingeditor = _start_author |
|
| 995 |
+ _start_dc_author = _start_author |
|
| 996 |
+ _start_dc_creator = _start_author |
|
| 997 |
+ _start_itunes_author = _start_author |
|
| 998 |
+ |
|
| 999 |
+ def _end_author(self): |
|
| 1000 |
+ self.pop('author')
|
|
| 1001 |
+ self.inauthor = 0 |
|
| 1002 |
+ self._sync_author_detail() |
|
| 1003 |
+ _end_managingeditor = _end_author |
|
| 1004 |
+ _end_dc_author = _end_author |
|
| 1005 |
+ _end_dc_creator = _end_author |
|
| 1006 |
+ _end_itunes_author = _end_author |
|
| 1007 |
+ |
|
| 1008 |
+ def _start_itunes_owner(self, attrsD): |
|
| 1009 |
+ self.inpublisher = 1 |
|
| 1010 |
+ self.push('publisher', 0)
|
|
| 1011 |
+ |
|
| 1012 |
+ def _end_itunes_owner(self): |
|
| 1013 |
+ self.pop('publisher')
|
|
| 1014 |
+ self.inpublisher = 0 |
|
| 1015 |
+ self._sync_author_detail('publisher')
|
|
| 1016 |
+ |
|
| 1017 |
+ def _start_contributor(self, attrsD): |
|
| 1018 |
+ self.incontributor = 1 |
|
| 1019 |
+ context = self._getContext() |
|
| 1020 |
+ context.setdefault('contributors', [])
|
|
| 1021 |
+ context['contributors'].append(FeedParserDict()) |
|
| 1022 |
+ self.push('contributor', 0)
|
|
| 1023 |
+ |
|
| 1024 |
+ def _end_contributor(self): |
|
| 1025 |
+ self.pop('contributor')
|
|
| 1026 |
+ self.incontributor = 0 |
|
| 1027 |
+ |
|
| 1028 |
+ def _start_dc_contributor(self, attrsD): |
|
| 1029 |
+ self.incontributor = 1 |
|
| 1030 |
+ context = self._getContext() |
|
| 1031 |
+ context.setdefault('contributors', [])
|
|
| 1032 |
+ context['contributors'].append(FeedParserDict()) |
|
| 1033 |
+ self.push('name', 0)
|
|
| 1034 |
+ |
|
| 1035 |
+ def _end_dc_contributor(self): |
|
| 1036 |
+ self._end_name() |
|
| 1037 |
+ self.incontributor = 0 |
|
| 1038 |
+ |
|
| 1039 |
+ def _start_name(self, attrsD): |
|
| 1040 |
+ self.push('name', 0)
|
|
| 1041 |
+ _start_itunes_name = _start_name |
|
| 1042 |
+ |
|
| 1043 |
+ def _end_name(self): |
|
| 1044 |
+ value = self.pop('name')
|
|
| 1045 |
+ if self.inpublisher: |
|
| 1046 |
+ self._save_author('name', value, 'publisher')
|
|
| 1047 |
+ elif self.inauthor: |
|
| 1048 |
+ self._save_author('name', value)
|
|
| 1049 |
+ elif self.incontributor: |
|
| 1050 |
+ self._save_contributor('name', value)
|
|
| 1051 |
+ elif self.intextinput: |
|
| 1052 |
+ context = self._getContext() |
|
| 1053 |
+ context['name'] = value |
|
| 1054 |
+ _end_itunes_name = _end_name |
|
| 1055 |
+ |
|
| 1056 |
+ def _start_width(self, attrsD): |
|
| 1057 |
+ self.push('width', 0)
|
|
| 1058 |
+ |
|
| 1059 |
+ def _end_width(self): |
|
| 1060 |
+ value = self.pop('width')
|
|
| 1061 |
+ try: |
|
| 1062 |
+ value = int(value) |
|
| 1063 |
+ except: |
|
| 1064 |
+ value = 0 |
|
| 1065 |
+ if self.inimage: |
|
| 1066 |
+ context = self._getContext() |
|
| 1067 |
+ context['width'] = value |
|
| 1068 |
+ |
|
| 1069 |
+ def _start_height(self, attrsD): |
|
| 1070 |
+ self.push('height', 0)
|
|
| 1071 |
+ |
|
| 1072 |
+ def _end_height(self): |
|
| 1073 |
+ value = self.pop('height')
|
|
| 1074 |
+ try: |
|
| 1075 |
+ value = int(value) |
|
| 1076 |
+ except: |
|
| 1077 |
+ value = 0 |
|
| 1078 |
+ if self.inimage: |
|
| 1079 |
+ context = self._getContext() |
|
| 1080 |
+ context['height'] = value |
|
| 1081 |
+ |
|
| 1082 |
+ def _start_url(self, attrsD): |
|
| 1083 |
+ self.push('href', 1)
|
|
| 1084 |
+ _start_homepage = _start_url |
|
| 1085 |
+ _start_uri = _start_url |
|
| 1086 |
+ |
|
| 1087 |
+ def _end_url(self): |
|
| 1088 |
+ value = self.pop('href')
|
|
| 1089 |
+ if self.inauthor: |
|
| 1090 |
+ self._save_author('href', value)
|
|
| 1091 |
+ elif self.incontributor: |
|
| 1092 |
+ self._save_contributor('href', value)
|
|
| 1093 |
+ _end_homepage = _end_url |
|
| 1094 |
+ _end_uri = _end_url |
|
| 1095 |
+ |
|
| 1096 |
+ def _start_email(self, attrsD): |
|
| 1097 |
+ self.push('email', 0)
|
|
| 1098 |
+ _start_itunes_email = _start_email |
|
| 1099 |
+ |
|
| 1100 |
+ def _end_email(self): |
|
| 1101 |
+ value = self.pop('email')
|
|
| 1102 |
+ if self.inpublisher: |
|
| 1103 |
+ self._save_author('email', value, 'publisher')
|
|
| 1104 |
+ elif self.inauthor: |
|
| 1105 |
+ self._save_author('email', value)
|
|
| 1106 |
+ elif self.incontributor: |
|
| 1107 |
+ self._save_contributor('email', value)
|
|
| 1108 |
+ _end_itunes_email = _end_email |
|
| 1109 |
+ |
|
| 1110 |
+ def _getContext(self): |
|
| 1111 |
+ if self.insource: |
|
| 1112 |
+ context = self.sourcedata |
|
| 1113 |
+ elif self.inimage: |
|
| 1114 |
+ context = self.feeddata['image'] |
|
| 1115 |
+ elif self.intextinput: |
|
| 1116 |
+ context = self.feeddata['textinput'] |
|
| 1117 |
+ elif self.inentry: |
|
| 1118 |
+ context = self.entries[-1] |
|
| 1119 |
+ else: |
|
| 1120 |
+ context = self.feeddata |
|
| 1121 |
+ return context |
|
| 1122 |
+ |
|
| 1123 |
+ def _save_author(self, key, value, prefix='author'): |
|
| 1124 |
+ context = self._getContext() |
|
| 1125 |
+ context.setdefault(prefix + '_detail', FeedParserDict()) |
|
| 1126 |
+ context[prefix + '_detail'][key] = value |
|
| 1127 |
+ self._sync_author_detail() |
|
| 1128 |
+ |
|
| 1129 |
+ def _save_contributor(self, key, value): |
|
| 1130 |
+ context = self._getContext() |
|
| 1131 |
+ context.setdefault('contributors', [FeedParserDict()])
|
|
| 1132 |
+ context['contributors'][-1][key] = value |
|
| 1133 |
+ |
|
| 1134 |
+ def _sync_author_detail(self, key='author'): |
|
| 1135 |
+ context = self._getContext() |
|
| 1136 |
+ detail = context.get('%s_detail' % key)
|
|
| 1137 |
+ if detail: |
|
| 1138 |
+ name = detail.get('name')
|
|
| 1139 |
+ email = detail.get('email')
|
|
| 1140 |
+ if name and email: |
|
| 1141 |
+ context[key] = '%s (%s)' % (name, email) |
|
| 1142 |
+ elif name: |
|
| 1143 |
+ context[key] = name |
|
| 1144 |
+ elif email: |
|
| 1145 |
+ context[key] = email |
|
| 1146 |
+ else: |
|
| 1147 |
+ author, email = context.get(key), None |
|
| 1148 |
+ if not author: return |
|
| 1149 |
+ emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
|
|
| 1150 |
+ if emailmatch: |
|
| 1151 |
+ email = emailmatch.group(0) |
|
| 1152 |
+ # probably a better way to do the following, but it passes all the tests |
|
| 1153 |
+ author = author.replace(email, '') |
|
| 1154 |
+ author = author.replace('()', '')
|
|
| 1155 |
+ author = author.replace('<>', '')
|
|
| 1156 |
+ author = author.replace('<>', '')
|
|
| 1157 |
+ author = author.strip() |
|
| 1158 |
+ if author and (author[0] == '('):
|
|
| 1159 |
+ author = author[1:] |
|
| 1160 |
+ if author and (author[-1] == ')'): |
|
| 1161 |
+ author = author[:-1] |
|
| 1162 |
+ author = author.strip() |
|
| 1163 |
+ if author or email: |
|
| 1164 |
+ context.setdefault('%s_detail' % key, FeedParserDict())
|
|
| 1165 |
+ if author: |
|
| 1166 |
+ context['%s_detail' % key]['name'] = author |
|
| 1167 |
+ if email: |
|
| 1168 |
+ context['%s_detail' % key]['email'] = email |
|
| 1169 |
+ |
|
| 1170 |
+ def _start_subtitle(self, attrsD): |
|
| 1171 |
+ self.pushContent('subtitle', attrsD, 'text/plain', 1)
|
|
| 1172 |
+ _start_tagline = _start_subtitle |
|
| 1173 |
+ _start_itunes_subtitle = _start_subtitle |
|
| 1174 |
+ |
|
| 1175 |
+ def _end_subtitle(self): |
|
| 1176 |
+ self.popContent('subtitle')
|
|
| 1177 |
+ _end_tagline = _end_subtitle |
|
| 1178 |
+ _end_itunes_subtitle = _end_subtitle |
|
| 1179 |
+ |
|
| 1180 |
+ def _start_rights(self, attrsD): |
|
| 1181 |
+ self.pushContent('rights', attrsD, 'text/plain', 1)
|
|
| 1182 |
+ _start_dc_rights = _start_rights |
|
| 1183 |
+ _start_copyright = _start_rights |
|
| 1184 |
+ |
|
| 1185 |
+ def _end_rights(self): |
|
| 1186 |
+ self.popContent('rights')
|
|
| 1187 |
+ _end_dc_rights = _end_rights |
|
| 1188 |
+ _end_copyright = _end_rights |
|
| 1189 |
+ |
|
| 1190 |
+ def _start_item(self, attrsD): |
|
| 1191 |
+ self.entries.append(FeedParserDict()) |
|
| 1192 |
+ self.push('item', 0)
|
|
| 1193 |
+ self.inentry = 1 |
|
| 1194 |
+ self.guidislink = 0 |
|
| 1195 |
+ self.hasTitle = 0 |
|
| 1196 |
+ id = self._getAttribute(attrsD, 'rdf:about') |
|
| 1197 |
+ if id: |
|
| 1198 |
+ context = self._getContext() |
|
| 1199 |
+ context['id'] = id |
|
| 1200 |
+ self._cdf_common(attrsD) |
|
| 1201 |
+ _start_entry = _start_item |
|
| 1202 |
+ _start_product = _start_item |
|
| 1203 |
+ |
|
| 1204 |
+ def _end_item(self): |
|
| 1205 |
+ self.pop('item')
|
|
| 1206 |
+ self.inentry = 0 |
|
| 1207 |
+ _end_entry = _end_item |
|
| 1208 |
+ |
|
| 1209 |
+ def _start_dc_language(self, attrsD): |
|
| 1210 |
+ self.push('language', 1)
|
|
| 1211 |
+ _start_language = _start_dc_language |
|
| 1212 |
+ |
|
| 1213 |
+ def _end_dc_language(self): |
|
| 1214 |
+ self.lang = self.pop('language')
|
|
| 1215 |
+ _end_language = _end_dc_language |
|
| 1216 |
+ |
|
| 1217 |
+ def _start_dc_publisher(self, attrsD): |
|
| 1218 |
+ self.push('publisher', 1)
|
|
| 1219 |
+ _start_webmaster = _start_dc_publisher |
|
| 1220 |
+ |
|
| 1221 |
+ def _end_dc_publisher(self): |
|
| 1222 |
+ self.pop('publisher')
|
|
| 1223 |
+ self._sync_author_detail('publisher')
|
|
| 1224 |
+ _end_webmaster = _end_dc_publisher |
|
| 1225 |
+ |
|
| 1226 |
+ def _start_published(self, attrsD): |
|
| 1227 |
+ self.push('published', 1)
|
|
| 1228 |
+ _start_dcterms_issued = _start_published |
|
| 1229 |
+ _start_issued = _start_published |
|
| 1230 |
+ |
|
| 1231 |
+ def _end_published(self): |
|
| 1232 |
+ value = self.pop('published')
|
|
| 1233 |
+ self._save('published_parsed', _parse_date(value))
|
|
| 1234 |
+ _end_dcterms_issued = _end_published |
|
| 1235 |
+ _end_issued = _end_published |
|
| 1236 |
+ |
|
| 1237 |
+ def _start_updated(self, attrsD): |
|
| 1238 |
+ self.push('updated', 1)
|
|
| 1239 |
+ _start_modified = _start_updated |
|
| 1240 |
+ _start_dcterms_modified = _start_updated |
|
| 1241 |
+ _start_pubdate = _start_updated |
|
| 1242 |
+ _start_dc_date = _start_updated |
|
| 1243 |
+ |
|
| 1244 |
+ def _end_updated(self): |
|
| 1245 |
+ value = self.pop('updated')
|
|
| 1246 |
+ parsed_value = _parse_date(value) |
|
| 1247 |
+ self._save('updated_parsed', parsed_value)
|
|
| 1248 |
+ _end_modified = _end_updated |
|
| 1249 |
+ _end_dcterms_modified = _end_updated |
|
| 1250 |
+ _end_pubdate = _end_updated |
|
| 1251 |
+ _end_dc_date = _end_updated |
|
| 1252 |
+ |
|
| 1253 |
+ def _start_created(self, attrsD): |
|
| 1254 |
+ self.push('created', 1)
|
|
| 1255 |
+ _start_dcterms_created = _start_created |
|
| 1256 |
+ |
|
| 1257 |
+ def _end_created(self): |
|
| 1258 |
+ value = self.pop('created')
|
|
| 1259 |
+ self._save('created_parsed', _parse_date(value))
|
|
| 1260 |
+ _end_dcterms_created = _end_created |
|
| 1261 |
+ |
|
| 1262 |
+ def _start_expirationdate(self, attrsD): |
|
| 1263 |
+ self.push('expired', 1)
|
|
| 1264 |
+ |
|
| 1265 |
+ def _end_expirationdate(self): |
|
| 1266 |
+ self._save('expired_parsed', _parse_date(self.pop('expired')))
|
|
| 1267 |
+ |
|
| 1268 |
+ def _start_cc_license(self, attrsD): |
|
| 1269 |
+ context = self._getContext() |
|
| 1270 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
| 1271 |
+ attrsD = FeedParserDict() |
|
| 1272 |
+ attrsD['rel']='license' |
|
| 1273 |
+ if value: attrsD['href']=value |
|
| 1274 |
+ context.setdefault('links', []).append(attrsD)
|
|
| 1275 |
+ |
|
| 1276 |
+ def _start_creativecommons_license(self, attrsD): |
|
| 1277 |
+ self.push('license', 1)
|
|
| 1278 |
+ _start_creativeCommons_license = _start_creativecommons_license |
|
| 1279 |
+ |
|
| 1280 |
+ def _end_creativecommons_license(self): |
|
| 1281 |
+ value = self.pop('license')
|
|
| 1282 |
+ context = self._getContext() |
|
| 1283 |
+ attrsD = FeedParserDict() |
|
| 1284 |
+ attrsD['rel']='license' |
|
| 1285 |
+ if value: attrsD['href']=value |
|
| 1286 |
+ context.setdefault('links', []).append(attrsD)
|
|
| 1287 |
+ del context['license'] |
|
| 1288 |
+ _end_creativeCommons_license = _end_creativecommons_license |
|
| 1289 |
+ |
|
| 1290 |
+ def _addXFN(self, relationships, href, name): |
|
| 1291 |
+ context = self._getContext() |
|
| 1292 |
+ xfn = context.setdefault('xfn', [])
|
|
| 1293 |
+ value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
|
|
| 1294 |
+ if value not in xfn: |
|
| 1295 |
+ xfn.append(value) |
|
| 1296 |
+ |
|
| 1297 |
+ def _addTag(self, term, scheme, label): |
|
| 1298 |
+ context = self._getContext() |
|
| 1299 |
+ tags = context.setdefault('tags', [])
|
|
| 1300 |
+ if (not term) and (not scheme) and (not label): return |
|
| 1301 |
+ value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
|
|
| 1302 |
+ if value not in tags: |
|
| 1303 |
+ tags.append(value) |
|
| 1304 |
+ |
|
| 1305 |
+ def _start_category(self, attrsD): |
|
| 1306 |
+ if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
|
|
| 1307 |
+ term = attrsD.get('term')
|
|
| 1308 |
+ scheme = attrsD.get('scheme', attrsD.get('domain'))
|
|
| 1309 |
+ label = attrsD.get('label')
|
|
| 1310 |
+ self._addTag(term, scheme, label) |
|
| 1311 |
+ self.push('category', 1)
|
|
| 1312 |
+ _start_dc_subject = _start_category |
|
| 1313 |
+ _start_keywords = _start_category |
|
| 1314 |
+ |
|
| 1315 |
+ def _end_itunes_keywords(self): |
|
| 1316 |
+ for term in self.pop('itunes_keywords').split():
|
|
| 1317 |
+ self._addTag(term, 'http://www.itunes.com/', None) |
|
| 1318 |
+ |
|
| 1319 |
+ def _start_itunes_category(self, attrsD): |
|
| 1320 |
+ self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
|
|
| 1321 |
+ self.push('category', 1)
|
|
| 1322 |
+ |
|
| 1323 |
+ def _end_category(self): |
|
| 1324 |
+ value = self.pop('category')
|
|
| 1325 |
+ if not value: return |
|
| 1326 |
+ context = self._getContext() |
|
| 1327 |
+ tags = context['tags'] |
|
| 1328 |
+ if value and len(tags) and not tags[-1]['term']: |
|
| 1329 |
+ tags[-1]['term'] = value |
|
| 1330 |
+ else: |
|
| 1331 |
+ self._addTag(value, None, None) |
|
| 1332 |
+ _end_dc_subject = _end_category |
|
| 1333 |
+ _end_keywords = _end_category |
|
| 1334 |
+ _end_itunes_category = _end_category |
|
| 1335 |
+ |
|
| 1336 |
+ def _start_cloud(self, attrsD): |
|
| 1337 |
+ self._getContext()['cloud'] = FeedParserDict(attrsD) |
|
| 1338 |
+ |
|
| 1339 |
+ def _start_link(self, attrsD): |
|
| 1340 |
+ attrsD.setdefault('rel', 'alternate')
|
|
| 1341 |
+ if attrsD['rel'] == 'self': |
|
| 1342 |
+ attrsD.setdefault('type', 'application/atom+xml')
|
|
| 1343 |
+ else: |
|
| 1344 |
+ attrsD.setdefault('type', 'text/html')
|
|
| 1345 |
+ context = self._getContext() |
|
| 1346 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
| 1347 |
+ if attrsD.has_key('href'):
|
|
| 1348 |
+ attrsD['href'] = self.resolveURI(attrsD['href']) |
|
| 1349 |
+ if attrsD.get('rel')=='enclosure' and not context.get('id'):
|
|
| 1350 |
+ context['id'] = attrsD.get('href')
|
|
| 1351 |
+ expectingText = self.infeed or self.inentry or self.insource |
|
| 1352 |
+ context.setdefault('links', [])
|
|
| 1353 |
+ context['links'].append(FeedParserDict(attrsD)) |
|
| 1354 |
+ if attrsD.has_key('href'):
|
|
| 1355 |
+ expectingText = 0 |
|
| 1356 |
+ if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
|
|
| 1357 |
+ context['link'] = attrsD['href'] |
|
| 1358 |
+ else: |
|
| 1359 |
+ self.push('link', expectingText)
|
|
| 1360 |
+ _start_producturl = _start_link |
|
| 1361 |
+ |
|
| 1362 |
+ def _end_link(self): |
|
| 1363 |
+ value = self.pop('link')
|
|
| 1364 |
+ context = self._getContext() |
|
| 1365 |
+ _end_producturl = _end_link |
|
| 1366 |
+ |
|
| 1367 |
+ def _start_guid(self, attrsD): |
|
| 1368 |
+ self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
|
|
| 1369 |
+ self.push('id', 1)
|
|
| 1370 |
+ |
|
| 1371 |
+ def _end_guid(self): |
|
| 1372 |
+ value = self.pop('id')
|
|
| 1373 |
+ self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
|
|
| 1374 |
+ if self.guidislink: |
|
| 1375 |
+ # guid acts as link, but only if 'ispermalink' is not present or is 'true', |
|
| 1376 |
+ # and only if the item doesn't already have a link element |
|
| 1377 |
+ self._save('link', value)
|
|
| 1378 |
+ |
|
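The guid handling above means a bare <guid> doubles as the entry link when the item has no explicit link; a small sketch (feedparser.parse also accepts a raw XML string):

    import feedparser
    d = feedparser.parse('<rss version="2.0"><channel><item>'
                         '<guid>http://example.com/post/1</guid>'
                         '</item></channel></rss>')
    print(d.entries[0].id)    # http://example.com/post/1
    print(d.entries[0].link)  # same value, because guidislink stayed true
    # with <guid isPermaLink="false">...</guid> only the id would be set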
| 1379 |
+ def _start_title(self, attrsD): |
|
| 1380 |
+ if self.svgOK: return self.unknown_starttag('title', attrsD.items())
|
|
| 1381 |
+ self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
|
|
| 1382 |
+ _start_dc_title = _start_title |
|
| 1383 |
+ _start_media_title = _start_title |
|
| 1384 |
+ |
|
| 1385 |
+ def _end_title(self): |
|
| 1386 |
+ if self.svgOK: return |
|
| 1387 |
+ value = self.popContent('title')
|
|
| 1388 |
+ if not value: return |
|
| 1389 |
+ context = self._getContext() |
|
| 1390 |
+ self.hasTitle = 1 |
|
| 1391 |
+ _end_dc_title = _end_title |
|
| 1392 |
+ |
|
| 1393 |
+ def _end_media_title(self): |
|
| 1394 |
+ hasTitle = self.hasTitle |
|
| 1395 |
+ self._end_title() |
|
| 1396 |
+ self.hasTitle = hasTitle |
|
| 1397 |
+ |
|
| 1398 |
+ def _start_description(self, attrsD): |
|
| 1399 |
+ context = self._getContext() |
|
| 1400 |
+ if context.has_key('summary'):
|
|
| 1401 |
+ self._summaryKey = 'content' |
|
| 1402 |
+ self._start_content(attrsD) |
|
| 1403 |
+ else: |
|
| 1404 |
+ self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
|
|
| 1405 |
+ _start_dc_description = _start_description |
|
| 1406 |
+ |
|
| 1407 |
+ def _start_abstract(self, attrsD): |
|
| 1408 |
+ self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
|
|
| 1409 |
+ |
|
| 1410 |
+ def _end_description(self): |
|
| 1411 |
+ if self._summaryKey == 'content': |
|
| 1412 |
+ self._end_content() |
|
| 1413 |
+ else: |
|
| 1414 |
+ value = self.popContent('description')
|
|
| 1415 |
+ self._summaryKey = None |
|
| 1416 |
+ _end_abstract = _end_description |
|
| 1417 |
+ _end_dc_description = _end_description |
|
| 1418 |
+ |
|
| 1419 |
+ def _start_info(self, attrsD): |
|
| 1420 |
+ self.pushContent('info', attrsD, 'text/plain', 1)
|
|
| 1421 |
+ _start_feedburner_browserfriendly = _start_info |
|
| 1422 |
+ |
|
| 1423 |
+ def _end_info(self): |
|
| 1424 |
+ self.popContent('info')
|
|
| 1425 |
+ _end_feedburner_browserfriendly = _end_info |
|
| 1426 |
+ |
|
| 1427 |
+ def _start_generator(self, attrsD): |
|
| 1428 |
+ if attrsD: |
|
| 1429 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
| 1430 |
+ if attrsD.has_key('href'):
|
|
| 1431 |
+ attrsD['href'] = self.resolveURI(attrsD['href']) |
|
| 1432 |
+ self._getContext()['generator_detail'] = FeedParserDict(attrsD) |
|
| 1433 |
+ self.push('generator', 1)
|
|
| 1434 |
+ |
|
| 1435 |
+ def _end_generator(self): |
|
| 1436 |
+ value = self.pop('generator')
|
|
| 1437 |
+ context = self._getContext() |
|
| 1438 |
+ if context.has_key('generator_detail'):
|
|
| 1439 |
+ context['generator_detail']['name'] = value |
|
| 1440 |
+ |
|
| 1441 |
+ def _start_admin_generatoragent(self, attrsD): |
|
| 1442 |
+ self.push('generator', 1)
|
|
| 1443 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
| 1444 |
+ if value: |
|
| 1445 |
+ self.elementstack[-1][2].append(value) |
|
| 1446 |
+ self.pop('generator')
|
|
| 1447 |
+ self._getContext()['generator_detail'] = FeedParserDict({'href': value})
|
|
| 1448 |
+ |
|
| 1449 |
+ def _start_admin_errorreportsto(self, attrsD): |
|
| 1450 |
+ self.push('errorreportsto', 1)
|
|
| 1451 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
| 1452 |
+ if value: |
|
| 1453 |
+ self.elementstack[-1][2].append(value) |
|
| 1454 |
+ self.pop('errorreportsto')
|
|
| 1455 |
+ |
|
| 1456 |
+ def _start_summary(self, attrsD): |
|
| 1457 |
+ context = self._getContext() |
|
| 1458 |
+ if context.has_key('summary'):
|
|
| 1459 |
+ self._summaryKey = 'content' |
|
| 1460 |
+ self._start_content(attrsD) |
|
| 1461 |
+ else: |
|
| 1462 |
+ self._summaryKey = 'summary' |
|
| 1463 |
+ self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) |
|
| 1464 |
+ _start_itunes_summary = _start_summary |
|
| 1465 |
+ |
|
| 1466 |
+ def _end_summary(self): |
|
| 1467 |
+ if self._summaryKey == 'content': |
|
| 1468 |
+ self._end_content() |
|
| 1469 |
+ else: |
|
| 1470 |
+ self.popContent(self._summaryKey or 'summary') |
|
| 1471 |
+ self._summaryKey = None |
|
| 1472 |
+ _end_itunes_summary = _end_summary |
|
| 1473 |
+ |
|
| 1474 |
+ def _start_enclosure(self, attrsD): |
|
| 1475 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
| 1476 |
+ context = self._getContext() |
|
| 1477 |
+ attrsD['rel']='enclosure' |
|
| 1478 |
+ context.setdefault('links', []).append(FeedParserDict(attrsD))
|
|
| 1479 |
+ href = attrsD.get('href')
|
|
| 1480 |
+ if href and not context.get('id'):
|
|
| 1481 |
+ context['id'] = href |
|
| 1482 |
+ |
|
| 1483 |
+ def _start_source(self, attrsD): |
|
| 1484 |
+ self.insource = 1 |
|
| 1485 |
+ self.hasTitle = 0 |
|
| 1486 |
+ |
|
| 1487 |
+ def _end_source(self): |
|
| 1488 |
+ self.insource = 0 |
|
| 1489 |
+ self._getContext()['source'] = copy.deepcopy(self.sourcedata) |
|
| 1490 |
+ self.sourcedata.clear() |
|
| 1491 |
+ |
|
| 1492 |
+ def _start_content(self, attrsD): |
|
| 1493 |
+ self.pushContent('content', attrsD, 'text/plain', 1)
|
|
| 1494 |
+ src = attrsD.get('src')
|
|
| 1495 |
+ if src: |
|
| 1496 |
+ self.contentparams['src'] = src |
|
| 1497 |
+ self.push('content', 1)
|
|
| 1498 |
+ |
|
| 1499 |
+ def _start_prodlink(self, attrsD): |
|
| 1500 |
+ self.pushContent('content', attrsD, 'text/html', 1)
|
|
| 1501 |
+ |
|
| 1502 |
+ def _start_body(self, attrsD): |
|
| 1503 |
+ self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
|
|
| 1504 |
+ _start_xhtml_body = _start_body |
|
| 1505 |
+ |
|
| 1506 |
+ def _start_content_encoded(self, attrsD): |
|
| 1507 |
+ self.pushContent('content', attrsD, 'text/html', 1)
|
|
| 1508 |
+ _start_fullitem = _start_content_encoded |
|
| 1509 |
+ |
|
| 1510 |
+ def _end_content(self): |
|
| 1511 |
+ copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
|
|
| 1512 |
+ value = self.popContent('content')
|
|
| 1513 |
+ if copyToDescription: |
|
| 1514 |
+ self._save('description', value)
|
|
| 1515 |
+ |
|
| 1516 |
+ _end_body = _end_content |
|
| 1517 |
+ _end_xhtml_body = _end_content |
|
| 1518 |
+ _end_content_encoded = _end_content |
|
| 1519 |
+ _end_fullitem = _end_content |
|
| 1520 |
+ _end_prodlink = _end_content |
|
| 1521 |
+ |
|
| 1522 |
+ def _start_itunes_image(self, attrsD): |
|
| 1523 |
+ self.push('itunes_image', 0)
|
|
| 1524 |
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
|
|
| 1525 |
+ _start_itunes_link = _start_itunes_image |
|
| 1526 |
+ |
|
| 1527 |
+ def _end_itunes_block(self): |
|
| 1528 |
+ value = self.pop('itunes_block', 0)
|
|
| 1529 |
+ self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 |
|
| 1530 |
+ |
|
| 1531 |
+ def _end_itunes_explicit(self): |
|
| 1532 |
+ value = self.pop('itunes_explicit', 0)
|
|
| 1533 |
+ self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 |
|
| 1534 |
+ |
|
| 1535 |
+if _XML_AVAILABLE: |
|
| 1536 |
+ class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): |
|
| 1537 |
+ def __init__(self, baseuri, baselang, encoding): |
|
| 1538 |
+ if _debug: sys.stderr.write('trying StrictFeedParser\n')
|
|
| 1539 |
+ xml.sax.handler.ContentHandler.__init__(self) |
|
| 1540 |
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
| 1541 |
+ self.bozo = 0 |
|
| 1542 |
+ self.exc = None |
|
| 1543 |
+ |
|
| 1544 |
+ def startPrefixMapping(self, prefix, uri): |
|
| 1545 |
+ self.trackNamespace(prefix, uri) |
|
| 1546 |
+ |
|
| 1547 |
+ def startElementNS(self, name, qname, attrs): |
|
| 1548 |
+ namespace, localname = name |
|
| 1549 |
+ lowernamespace = str(namespace or '').lower() |
|
| 1550 |
+ if lowernamespace.find('backend.userland.com/rss') <> -1:
|
|
| 1551 |
+ # match any backend.userland.com namespace |
|
| 1552 |
+ namespace = 'http://backend.userland.com/rss' |
|
| 1553 |
+ lowernamespace = namespace |
|
| 1554 |
+ if qname and qname.find(':') > 0:
|
|
| 1555 |
+ givenprefix = qname.split(':')[0]
|
|
| 1556 |
+ else: |
|
| 1557 |
+ givenprefix = None |
|
| 1558 |
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
| 1559 |
+ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): |
|
| 1560 |
+ raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix |
|
| 1561 |
+ localname = str(localname).lower() |
|
| 1562 |
+ |
|
| 1563 |
+ # qname implementation is horribly broken in Python 2.1 (it |
|
| 1564 |
+ # doesn't report any), and slightly broken in Python 2.2 (it |
|
| 1565 |
+ # doesn't report the xml: namespace). So we match up namespaces |
|
| 1566 |
+ # with a known list first, and then possibly override them with |
|
| 1567 |
+ # the qnames the SAX parser gives us (if indeed it gives us any |
|
| 1568 |
+ # at all). Thanks to MatejC for helping me test this and |
|
| 1569 |
+ # tirelessly telling me that it didn't work yet. |
|
| 1570 |
+ attrsD = {}
|
|
| 1571 |
+ if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': |
|
| 1572 |
+ attrsD['xmlns']=namespace |
|
| 1573 |
+ if localname=='svg' and namespace=='http://www.w3.org/2000/svg': |
|
| 1574 |
+ attrsD['xmlns']=namespace |
|
| 1575 |
+ |
|
| 1576 |
+ if prefix: |
|
| 1577 |
+ localname = prefix.lower() + ':' + localname |
|
| 1578 |
+ elif namespace and not qname: #Expat |
|
| 1579 |
+ for name,value in self.namespacesInUse.items(): |
|
| 1580 |
+ if name and value == namespace: |
|
| 1581 |
+ localname = name + ':' + localname |
|
| 1582 |
+ break |
|
| 1583 |
+ if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
|
|
| 1584 |
+ |
|
| 1585 |
+ for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): |
|
| 1586 |
+ lowernamespace = (namespace or '').lower() |
|
| 1587 |
+ prefix = self._matchnamespaces.get(lowernamespace, '') |
|
| 1588 |
+ if prefix: |
|
| 1589 |
+ attrlocalname = prefix + ':' + attrlocalname |
|
| 1590 |
+ attrsD[str(attrlocalname).lower()] = attrvalue |
|
| 1591 |
+ for qname in attrs.getQNames(): |
|
| 1592 |
+ attrsD[str(qname).lower()] = attrs.getValueByQName(qname) |
|
| 1593 |
+ self.unknown_starttag(localname, attrsD.items()) |
|
| 1594 |
+ |
|
| 1595 |
+ def characters(self, text): |
|
| 1596 |
+ self.handle_data(text) |
|
| 1597 |
+ |
|
| 1598 |
+ def endElementNS(self, name, qname): |
|
| 1599 |
+ namespace, localname = name |
|
| 1600 |
+ lowernamespace = str(namespace or '').lower() |
|
| 1601 |
+ if qname and qname.find(':') > 0:
|
|
| 1602 |
+ givenprefix = qname.split(':')[0]
|
|
| 1603 |
+ else: |
|
| 1604 |
+ givenprefix = '' |
|
| 1605 |
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
| 1606 |
+ if prefix: |
|
| 1607 |
+ localname = prefix + ':' + localname |
|
| 1608 |
+ elif namespace and not qname: #Expat |
|
| 1609 |
+ for name,value in self.namespacesInUse.items(): |
|
| 1610 |
+ if name and value == namespace: |
|
| 1611 |
+ localname = name + ':' + localname |
|
| 1612 |
+ break |
|
| 1613 |
+ localname = str(localname).lower() |
|
| 1614 |
+ self.unknown_endtag(localname) |
|
| 1615 |
+ |
|
| 1616 |
+ def error(self, exc): |
|
| 1617 |
+ self.bozo = 1 |
|
| 1618 |
+ self.exc = exc |
|
| 1619 |
+ |
|
| 1620 |
+ def fatalError(self, exc): |
|
| 1621 |
+ self.error(exc) |
|
| 1622 |
+ raise exc |
|
| 1623 |
+ |
|
| 1624 |
+class _BaseHTMLProcessor(sgmllib.SGMLParser): |
|
| 1625 |
+ special = re.compile('''[<>'"]''')
|
|
| 1626 |
+ bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
|
|
| 1627 |
+ elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', |
|
| 1628 |
+ 'img', 'input', 'isindex', 'link', 'meta', 'param'] |
|
| 1629 |
+ |
|
| 1630 |
+ def __init__(self, encoding, type): |
|
| 1631 |
+ self.encoding = encoding |
|
| 1632 |
+ self.type = type |
|
| 1633 |
+ if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
|
|
| 1634 |
+ sgmllib.SGMLParser.__init__(self) |
|
| 1635 |
+ |
|
| 1636 |
+ def reset(self): |
|
| 1637 |
+ self.pieces = [] |
|
| 1638 |
+ sgmllib.SGMLParser.reset(self) |
|
| 1639 |
+ |
|
| 1640 |
+ def _shorttag_replace(self, match): |
|
| 1641 |
+ tag = match.group(1) |
|
| 1642 |
+ if tag in self.elements_no_end_tag: |
|
| 1643 |
+ return '<' + tag + ' />' |
|
| 1644 |
+ else: |
|
| 1645 |
+ return '<' + tag + '></' + tag + '>' |
|
| 1646 |
+ |
|
| 1647 |
+ def parse_starttag(self,i): |
|
| 1648 |
+ j=sgmllib.SGMLParser.parse_starttag(self, i) |
|
| 1649 |
+ if self.type == 'application/xhtml+xml': |
|
| 1650 |
+ if j>2 and self.rawdata[j-2:j]=='/>': |
|
| 1651 |
+ self.unknown_endtag(self.lasttag) |
|
| 1652 |
+ return j |
|
| 1653 |
+ |
|
| 1654 |
+ def feed(self, data): |
|
| 1655 |
+ data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data) |
|
| 1656 |
+ #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace |
|
| 1657 |
+ data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) |
|
| 1658 |
+ data = data.replace('&#39;', "'")
|
|
| 1659 |
+ data = data.replace('&#34;', '"')
|
|
| 1660 |
+ if self.encoding and type(data) == type(u''): |
|
| 1661 |
+ data = data.encode(self.encoding) |
|
| 1662 |
+ sgmllib.SGMLParser.feed(self, data) |
|
| 1663 |
+ sgmllib.SGMLParser.close(self) |
|
| 1664 |
+ |
|
| 1665 |
+ def normalize_attrs(self, attrs): |
|
| 1666 |
+ if not attrs: return attrs |
|
| 1667 |
+ # utility method to be called by descendants |
|
| 1668 |
+ attrs = dict([(k.lower(), v) for k, v in attrs]).items() |
|
| 1669 |
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
|
|
| 1670 |
+ attrs.sort() |
|
| 1671 |
+ return attrs |
|
| 1672 |
+ |
|
| 1673 |
+ def unknown_starttag(self, tag, attrs): |
|
| 1674 |
+ # called for each start tag |
|
| 1675 |
+ # attrs is a list of (attr, value) tuples |
|
| 1676 |
+ # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
|
|
| 1677 |
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
|
|
| 1678 |
+ uattrs = [] |
|
| 1679 |
+ strattrs='' |
|
| 1680 |
+ if attrs: |
|
| 1681 |
+ for key, value in attrs: |
|
| 1682 |
+ value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
|
| 1683 |
+ value = self.bare_ampersand.sub("&amp;", value)
|
|
| 1684 |
+ # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds |
|
| 1685 |
+ if type(value) != type(u''): |
|
| 1686 |
+ try: |
|
| 1687 |
+ value = unicode(value, self.encoding) |
|
| 1688 |
+ except: |
|
| 1689 |
+ value = unicode(value, 'iso-8859-1') |
|
| 1690 |
+ uattrs.append((unicode(key, self.encoding), value)) |
|
| 1691 |
+ strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]) |
|
| 1692 |
+ if self.encoding: |
|
| 1693 |
+ try: |
|
| 1694 |
+ strattrs=strattrs.encode(self.encoding) |
|
| 1695 |
+ except: |
|
| 1696 |
+ pass |
|
| 1697 |
+ if tag in self.elements_no_end_tag: |
|
| 1698 |
+ self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
|
|
| 1699 |
+ else: |
|
| 1700 |
+ self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
|
|
| 1701 |
+ |
|
| 1702 |
+ def unknown_endtag(self, tag): |
|
| 1703 |
+ # called for each end tag, e.g. for </pre>, tag will be 'pre' |
|
| 1704 |
+ # Reconstruct the original end tag. |
|
| 1705 |
+ if tag not in self.elements_no_end_tag: |
|
| 1706 |
+ self.pieces.append("</%(tag)s>" % locals())
|
|
| 1707 |
+ |
|
| 1708 |
+ def handle_charref(self, ref): |
|
| 1709 |
+ # called for each character reference, e.g. for '&#160;', ref will be '160' |
|
| 1710 |
+ # Reconstruct the original character reference. |
|
| 1711 |
+ if ref.startswith('x'):
|
|
| 1712 |
+ value = unichr(int(ref[1:],16)) |
|
| 1713 |
+ else: |
|
| 1714 |
+ value = unichr(int(ref)) |
|
| 1715 |
+ |
|
| 1716 |
+ if value in _cp1252.keys(): |
|
| 1717 |
+ self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
|
|
| 1718 |
+ else: |
|
| 1719 |
+ self.pieces.append('&#%(ref)s;' % locals())
|
|
| 1720 |
+ |
|
| 1721 |
+ def handle_entityref(self, ref): |
|
| 1722 |
+ # called for each entity reference, e.g. for '&copy;', ref will be 'copy' |
|
| 1723 |
+ # Reconstruct the original entity reference. |
|
| 1724 |
+ if name2codepoint.has_key(ref): |
|
| 1725 |
+ self.pieces.append('&%(ref)s;' % locals())
|
|
| 1726 |
+ else: |
|
| 1727 |
+ self.pieces.append('&%(ref)s' % locals())
|
|
| 1728 |
+ |
|
| 1729 |
+ def handle_data(self, text): |
|
| 1730 |
+ # called for each block of plain text, i.e. outside of any tag and |
|
| 1731 |
+ # not containing any character or entity references |
|
| 1732 |
+ # Store the original text verbatim. |
|
| 1733 |
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
|
|
| 1734 |
+ self.pieces.append(text) |
|
| 1735 |
+ |
|
| 1736 |
+ def handle_comment(self, text): |
|
| 1737 |
+ # called for each HTML comment, e.g. <!-- insert Javascript code here --> |
|
| 1738 |
+ # Reconstruct the original comment. |
|
| 1739 |
+ self.pieces.append('<!--%(text)s-->' % locals())
|
|
| 1740 |
+ |
|
| 1741 |
+ def handle_pi(self, text): |
|
| 1742 |
+ # called for each processing instruction, e.g. <?instruction> |
|
| 1743 |
+ # Reconstruct original processing instruction. |
|
| 1744 |
+ self.pieces.append('<?%(text)s>' % locals())
|
|
| 1745 |
+ |
|
| 1746 |
+ def handle_decl(self, text): |
|
| 1747 |
+ # called for the DOCTYPE, if present, e.g. |
|
| 1748 |
+ # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" |
|
| 1749 |
+ # "http://www.w3.org/TR/html4/loose.dtd"> |
|
| 1750 |
+ # Reconstruct original DOCTYPE |
|
| 1751 |
+ self.pieces.append('<!%(text)s>' % locals())
|
|
| 1752 |
+ |
|
| 1753 |
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match |
|
| 1754 |
+ def _scan_name(self, i, declstartpos): |
|
| 1755 |
+ rawdata = self.rawdata |
|
| 1756 |
+ n = len(rawdata) |
|
| 1757 |
+ if i == n: |
|
| 1758 |
+ return None, -1 |
|
| 1759 |
+ m = self._new_declname_match(rawdata, i) |
|
| 1760 |
+ if m: |
|
| 1761 |
+ s = m.group() |
|
| 1762 |
+ name = s.strip() |
|
| 1763 |
+ if (i + len(s)) == n: |
|
| 1764 |
+ return None, -1 # end of buffer |
|
| 1765 |
+ return name.lower(), m.end() |
|
| 1766 |
+ else: |
|
| 1767 |
+ self.handle_data(rawdata) |
|
| 1768 |
+# self.updatepos(declstartpos, i) |
|
| 1769 |
+ return None, -1 |
|
| 1770 |
+ |
|
| 1771 |
+ def convert_charref(self, name): |
|
| 1772 |
+ return '&#%s;' % name |
|
| 1773 |
+ |
|
| 1774 |
+ def convert_entityref(self, name): |
|
| 1775 |
+ return '&%s;' % name |
|
| 1776 |
+ |
|
| 1777 |
+ def output(self): |
|
| 1778 |
+ '''Return processed HTML as a single string''' |
|
| 1779 |
+ return ''.join([str(p) for p in self.pieces]) |
|
| 1780 |
+ |
|
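_BaseHTMLProcessor above is the round-tripping base class: it re-emits every token it sees (normalizing empty elements such as br and img), and the subclasses below override individual handlers. A minimal direct use, assuming sgmllib is importable as at the top of this file:

    p = _BaseHTMLProcessor('utf-8', 'text/html')
    p.feed('<p class="x">hi<br></p>')
    print(p.output())   # e.g. <p class="x">hi<br /></p>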
| 1781 |
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): |
|
| 1782 |
+ def __init__(self, baseuri, baselang, encoding, entities): |
|
| 1783 |
+ sgmllib.SGMLParser.__init__(self) |
|
| 1784 |
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
| 1785 |
+ _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml') |
|
| 1786 |
+ self.entities=entities |
|
| 1787 |
+ |
|
| 1788 |
+ def decodeEntities(self, element, data): |
|
| 1789 |
+ data = data.replace('&#60;', '&lt;')
|
|
| 1790 |
+ data = data.replace('&#x3c;', '&lt;')
|
|
| 1791 |
+ data = data.replace('&#x3C;', '&lt;')
|
|
| 1792 |
+ data = data.replace('&#62;', '&gt;')
|
|
| 1793 |
+ data = data.replace('&#x3e;', '&gt;')
|
|
| 1794 |
+ data = data.replace('&#x3E;', '&gt;')
|
|
| 1795 |
+ data = data.replace('&#38;', '&amp;')
|
|
| 1796 |
+ data = data.replace('&#x26;', '&amp;')
|
|
| 1797 |
+ data = data.replace('&#34;', '&quot;')
|
|
| 1798 |
+ data = data.replace('&#x22;', '&quot;')
|
|
| 1799 |
+ data = data.replace('&#39;', '&apos;')
|
|
| 1800 |
+ data = data.replace('&#x27;', '&apos;')
|
|
| 1801 |
+ if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
|
|
| 1802 |
+ data = data.replace('&lt;', '<')
|
|
| 1803 |
+ data = data.replace('&gt;', '>')
|
|
| 1804 |
+ data = data.replace('&amp;', '&')
|
|
| 1805 |
+ data = data.replace('&quot;', '"')
|
|
| 1806 |
+ data = data.replace('&apos;', "'")
|
|
| 1807 |
+ return data |
|
| 1808 |
+ |
|
| 1809 |
+ def strattrs(self, attrs): |
|
| 1810 |
+ return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
|
|
| 1811 |
+ |
|
| 1812 |
+class _MicroformatsParser: |
|
| 1813 |
+ STRING = 1 |
|
| 1814 |
+ DATE = 2 |
|
| 1815 |
+ URI = 3 |
|
| 1816 |
+ NODE = 4 |
|
| 1817 |
+ EMAIL = 5 |
|
| 1818 |
+ |
|
| 1819 |
+ known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'] |
|
| 1820 |
+ known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'] |
|
| 1821 |
+ |
|
| 1822 |
+ def __init__(self, data, baseuri, encoding): |
|
| 1823 |
+ self.document = BeautifulSoup.BeautifulSoup(data) |
|
| 1824 |
+ self.baseuri = baseuri |
|
| 1825 |
+ self.encoding = encoding |
|
| 1826 |
+ if type(data) == type(u''): |
|
| 1827 |
+ data = data.encode(encoding) |
|
| 1828 |
+ self.tags = [] |
|
| 1829 |
+ self.enclosures = [] |
|
| 1830 |
+ self.xfn = [] |
|
| 1831 |
+ self.vcard = None |
|
| 1832 |
+ |
|
| 1833 |
+ def vcardEscape(self, s): |
|
| 1834 |
+ if type(s) in (type(''), type(u'')):
|
|
| 1835 |
+ s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
|
|
| 1836 |
+ return s |
|
| 1837 |
+ |
|
| 1838 |
+ def vcardFold(self, s): |
|
| 1839 |
+ s = re.sub(';+$', '', s)
|
|
| 1840 |
+ sFolded = '' |
|
| 1841 |
+ iMax = 75 |
|
| 1842 |
+ sPrefix = '' |
|
| 1843 |
+ while len(s) > iMax: |
|
| 1844 |
+ sFolded += sPrefix + s[:iMax] + '\n' |
|
| 1845 |
+ s = s[iMax:] |
|
| 1846 |
+ sPrefix = ' ' |
|
| 1847 |
+ iMax = 74 |
|
| 1848 |
+ sFolded += sPrefix + s |
|
| 1849 |
+ return sFolded |
|
| 1850 |
+ |
|
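The folding in vcardFold above implements RFC 2426 line folding: the first physical line carries 75 characters, continuation lines start with a single space and carry at most 74. A standalone mirror of that loop on a hypothetical long property:

    s = 'NOTE:' + 'x' * 100          # 105 characters, too long for one line
    folded, prefix, width = '', '', 75
    while len(s) > width:
        folded += prefix + s[:width] + '\n'
        s, prefix, width = s[width:], ' ', 74
    folded += prefix + s             # 75-char first line, then a space-led remainder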
| 1851 |
+ def normalize(self, s): |
|
| 1852 |
+ return re.sub(r'\s+', ' ', s).strip() |
|
| 1853 |
+ |
|
| 1854 |
+ def unique(self, aList): |
|
| 1855 |
+ results = [] |
|
| 1856 |
+ for element in aList: |
|
| 1857 |
+ if element not in results: |
|
| 1858 |
+ results.append(element) |
|
| 1859 |
+ return results |
|
| 1860 |
+ |
|
| 1861 |
+ def toISO8601(self, dt): |
|
| 1862 |
+ return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
|
|
| 1863 |
+ |
|
| 1864 |
+ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0): |
|
| 1865 |
+ all = lambda x: 1 |
|
| 1866 |
+ sProperty = sProperty.lower() |
|
| 1867 |
+ bFound = 0 |
|
| 1868 |
+ bNormalize = 1 |
|
| 1869 |
+ propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
|
|
| 1870 |
+ if bAllowMultiple and (iPropertyType != self.NODE): |
|
| 1871 |
+ snapResults = [] |
|
| 1872 |
+ containers = elmRoot(['ul', 'ol'], propertyMatch) |
|
| 1873 |
+ for container in containers: |
|
| 1874 |
+ snapResults.extend(container('li'))
|
|
| 1875 |
+ bFound = (len(snapResults) != 0) |
|
| 1876 |
+ if not bFound: |
|
| 1877 |
+ snapResults = elmRoot(all, propertyMatch) |
|
| 1878 |
+ bFound = (len(snapResults) != 0) |
|
| 1879 |
+ if (not bFound) and (sProperty == 'value'): |
|
| 1880 |
+ snapResults = elmRoot('pre')
|
|
| 1881 |
+ bFound = (len(snapResults) != 0) |
|
| 1882 |
+ bNormalize = not bFound |
|
| 1883 |
+ if not bFound: |
|
| 1884 |
+ snapResults = [elmRoot] |
|
| 1885 |
+ bFound = (len(snapResults) != 0) |
|
| 1886 |
+ arFilter = [] |
|
| 1887 |
+ if sProperty == 'vcard': |
|
| 1888 |
+ snapFilter = elmRoot(all, propertyMatch) |
|
| 1889 |
+ for node in snapFilter: |
|
| 1890 |
+ if node.findParent(all, propertyMatch): |
|
| 1891 |
+ arFilter.append(node) |
|
| 1892 |
+ arResults = [] |
|
| 1893 |
+ for node in snapResults: |
|
| 1894 |
+ if node not in arFilter: |
|
| 1895 |
+ arResults.append(node) |
|
| 1896 |
+ bFound = (len(arResults) != 0) |
|
| 1897 |
+ if not bFound: |
|
| 1898 |
+ if bAllowMultiple: return [] |
|
| 1899 |
+ elif iPropertyType == self.STRING: return '' |
|
| 1900 |
+ elif iPropertyType == self.DATE: return None |
|
| 1901 |
+ elif iPropertyType == self.URI: return '' |
|
| 1902 |
+ elif iPropertyType == self.NODE: return None |
|
| 1903 |
+ else: return None |
|
| 1904 |
+ arValues = [] |
|
| 1905 |
+ for elmResult in arResults: |
|
| 1906 |
+ sValue = None |
|
| 1907 |
+ if iPropertyType == self.NODE: |
|
| 1908 |
+ if bAllowMultiple: |
|
| 1909 |
+ arValues.append(elmResult) |
|
| 1910 |
+ continue |
|
| 1911 |
+ else: |
|
| 1912 |
+ return elmResult |
|
| 1913 |
+ sNodeName = elmResult.name.lower() |
|
| 1914 |
+ if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): |
|
| 1915 |
+ sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
|
|
| 1916 |
+ if sValue: |
|
| 1917 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
| 1918 |
+ if (not sValue) and (sNodeName == 'abbr'): |
|
| 1919 |
+ sValue = elmResult.get('title')
|
|
| 1920 |
+ if sValue: |
|
| 1921 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
| 1922 |
+ if (not sValue) and (iPropertyType == self.URI): |
|
| 1923 |
+ if sNodeName == 'a': sValue = elmResult.get('href')
|
|
| 1924 |
+ elif sNodeName == 'img': sValue = elmResult.get('src')
|
|
| 1925 |
+ elif sNodeName == 'object': sValue = elmResult.get('data')
|
|
| 1926 |
+ if sValue: |
|
| 1927 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
| 1928 |
+ if (not sValue) and (sNodeName == 'img'): |
|
| 1929 |
+ sValue = elmResult.get('alt')
|
|
| 1930 |
+ if sValue: |
|
| 1931 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
| 1932 |
+ if not sValue: |
|
| 1933 |
+ sValue = elmResult.renderContents() |
|
| 1934 |
+ sValue = re.sub(r'<\S[^>]*>', '', sValue) |
|
| 1935 |
+ sValue = sValue.replace('\r\n', '\n')
|
|
| 1936 |
+ sValue = sValue.replace('\r', '\n')
|
|
| 1937 |
+ if sValue: |
|
| 1938 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
| 1939 |
+ if not sValue: continue |
|
| 1940 |
+ if iPropertyType == self.DATE: |
|
| 1941 |
+ sValue = _parse_date_iso8601(sValue) |
|
| 1942 |
+ if bAllowMultiple: |
|
| 1943 |
+ arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) |
|
| 1944 |
+ else: |
|
| 1945 |
+ return bAutoEscape and self.vcardEscape(sValue) or sValue |
|
| 1946 |
+ return arValues |
|
| 1947 |
+ |
|
| 1948 |
+ def findVCards(self, elmRoot, bAgentParsing=0): |
|
| 1949 |
+ sVCards = '' |
|
| 1950 |
+ |
|
| 1951 |
+ if not bAgentParsing: |
|
| 1952 |
+ arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) |
|
| 1953 |
+ else: |
|
| 1954 |
+ arCards = [elmRoot] |
|
| 1955 |
+ |
|
| 1956 |
+ for elmCard in arCards: |
|
| 1957 |
+ arLines = [] |
|
| 1958 |
+ |
|
| 1959 |
+ def processSingleString(sProperty): |
|
| 1960 |
+ sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1) |
|
| 1961 |
+ if sValue: |
|
| 1962 |
+ arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) |
|
| 1963 |
+ return sValue or '' |
|
| 1964 |
+ |
|
| 1965 |
+ def processSingleURI(sProperty): |
|
| 1966 |
+ sValue = self.getPropertyValue(elmCard, sProperty, self.URI) |
|
| 1967 |
+ if sValue: |
|
| 1968 |
+ sContentType = '' |
|
| 1969 |
+ sEncoding = '' |
|
| 1970 |
+ sValueKey = '' |
|
| 1971 |
+ if sValue.startswith('data:'):
|
|
| 1972 |
+ sEncoding = ';ENCODING=b' |
|
| 1973 |
+ sContentType = sValue.split(';')[0].split('/').pop()
|
|
| 1974 |
+ sValue = sValue.split(',', 1).pop()
|
|
| 1975 |
+ else: |
|
| 1976 |
+ elmValue = self.getPropertyValue(elmCard, sProperty) |
|
| 1977 |
+ if elmValue: |
|
| 1978 |
+ if sProperty != 'url': |
|
| 1979 |
+ sValueKey = ';VALUE=uri' |
|
| 1980 |
+ sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
|
|
| 1981 |
+ sContentType = sContentType.upper() |
|
| 1982 |
+ if sContentType == 'OCTET-STREAM': |
|
| 1983 |
+ sContentType = '' |
|
| 1984 |
+ if sContentType: |
|
| 1985 |
+ sContentType = ';TYPE=' + sContentType.upper() |
|
| 1986 |
+ arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) |
|
| 1987 |
+ |
|
| 1988 |
+ def processTypeValue(sProperty, arDefaultType, arForceType=None): |
|
| 1989 |
+ arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) |
|
| 1990 |
+ for elmResult in arResults: |
|
| 1991 |
+ arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) |
|
| 1992 |
+ if arForceType: |
|
| 1993 |
+ arType = self.unique(arForceType + arType) |
|
| 1994 |
+ if not arType: |
|
| 1995 |
+ arType = arDefaultType |
|
| 1996 |
+ sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) |
|
| 1997 |
+ if sValue: |
|
| 1998 |
+ arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) |
|
| 1999 |
+ |
|
| 2000 |
+ # AGENT |
|
| 2001 |
+ # must do this before all other properties because it is destructive |
|
| 2002 |
+ # (removes nested class="vcard" nodes so they don't interfere with |
|
| 2003 |
+ # this vcard's other properties) |
|
| 2004 |
+ arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) |
|
| 2005 |
+ for elmAgent in arAgent: |
|
| 2006 |
+ if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
|
|
| 2007 |
+ sAgentValue = self.findVCards(elmAgent, 1) + '\n' |
|
| 2008 |
+ sAgentValue = sAgentValue.replace('\n', '\\n')
|
|
| 2009 |
+ sAgentValue = sAgentValue.replace(';', '\\;')
|
|
| 2010 |
+ if sAgentValue: |
|
| 2011 |
+ arLines.append(self.vcardFold('AGENT:' + sAgentValue))
|
|
| 2012 |
+ elmAgent['class'] = '' |
|
| 2013 |
+ elmAgent.contents = [] |
|
| 2014 |
+ else: |
|
| 2015 |
+ sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); |
|
| 2016 |
+ if sAgentValue: |
|
| 2017 |
+ arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
|
|
| 2018 |
+ |
|
| 2019 |
+ # FN (full name) |
|
| 2020 |
+ sFN = processSingleString('fn')
|
|
| 2021 |
+ |
|
| 2022 |
+ # N (name) |
|
| 2023 |
+ elmName = self.getPropertyValue(elmCard, 'n') |
|
| 2024 |
+ if elmName: |
|
| 2025 |
+ sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) |
|
| 2026 |
+ sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) |
|
| 2027 |
+ arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) |
|
| 2028 |
+ arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) |
|
| 2029 |
+ arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) |
|
| 2030 |
+ arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
|
|
| 2031 |
+ sGivenName + ';' + |
|
| 2032 |
+ ','.join(arAdditionalNames) + ';' + |
|
| 2033 |
+ ','.join(arHonorificPrefixes) + ';' + |
|
| 2034 |
+ ','.join(arHonorificSuffixes))) |
|
| 2035 |
+ elif sFN: |
|
| 2036 |
+ # implied "N" optimization |
|
| 2037 |
+ # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization |
|
| 2038 |
+ arNames = self.normalize(sFN).split() |
|
| 2039 |
+ if len(arNames) == 2: |
|
| 2040 |
+ bFamilyNameFirst = (arNames[0].endswith(',') or
|
|
| 2041 |
+ len(arNames[1]) == 1 or |
|
| 2042 |
+ ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
|
|
| 2043 |
+ if bFamilyNameFirst: |
|
| 2044 |
+ arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
|
|
| 2045 |
+ else: |
|
| 2046 |
+ arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
|
|
| 2047 |
+ |
|
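    # worked examples of the implied-"N" heuristic above (hypothetical names):
    #   FN 'Jane Doe'  -> arNames ['Jane', 'Doe'], no family-name marker -> N:Doe;Jane
    #   FN 'Doe, Jane' -> trailing comma marks the family name first     -> N:Doe,;Jane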
| 2048 |
+ # SORT-STRING |
|
| 2049 |
+ sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) |
|
| 2050 |
+ if sSortString: |
|
| 2051 |
+ arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
|
|
| 2052 |
+ |
|
| 2053 |
+ # NICKNAME |
|
| 2054 |
+ arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) |
|
| 2055 |
+ if arNickname: |
|
| 2056 |
+ arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
|
|
| 2057 |
+ |
|
| 2058 |
+ # PHOTO |
|
| 2059 |
+ processSingleURI('photo')
|
|
| 2060 |
+ |
|
| 2061 |
+ # BDAY |
|
| 2062 |
+ dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) |
|
| 2063 |
+ if dtBday: |
|
| 2064 |
+ arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
|
|
| 2065 |
+ |
|
| 2066 |
+ # ADR (address) |
|
| 2067 |
+ arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) |
|
| 2068 |
+ for elmAdr in arAdr: |
|
| 2069 |
+ arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) |
|
| 2070 |
+ if not arType: |
|
| 2071 |
+ arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 |
|
| 2072 |
+ sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) |
|
| 2073 |
+ sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) |
|
| 2074 |
+ sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) |
|
| 2075 |
+ sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) |
|
| 2076 |
+ sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) |
|
| 2077 |
+ sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1) |
|
| 2078 |
+ sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) |
|
| 2079 |
+ arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
|
|
| 2080 |
+ sPostOfficeBox + ';' + |
|
| 2081 |
+ sExtendedAddress + ';' + |
|
| 2082 |
+ sStreetAddress + ';' + |
|
| 2083 |
+ sLocality + ';' + |
|
| 2084 |
+ sRegion + ';' + |
|
| 2085 |
+ sPostalCode + ';' + |
|
| 2086 |
+ sCountryName)) |
|
| 2087 |
+ |
|
| 2088 |
+ # LABEL |
|
| 2089 |
+ processTypeValue('label', ['intl','postal','parcel','work'])
|
|
| 2090 |
+ |
|
| 2091 |
+ # TEL (phone number) |
|
| 2092 |
+ processTypeValue('tel', ['voice'])
|
|
| 2093 |
+ |
|
| 2094 |
|
|
| 2095 |
+ processTypeValue('email', ['internet'], ['internet'])
|
|
| 2096 |
+ |
|
| 2097 |
+ # MAILER |
|
| 2098 |
+ processSingleString('mailer')
|
|
| 2099 |
+ |
|
| 2100 |
+ # TZ (timezone) |
|
| 2101 |
+ processSingleString('tz')
|
|
| 2102 |
+ |
|
| 2103 |
+ # GEO (geographical information) |
|
| 2104 |
+ elmGeo = self.getPropertyValue(elmCard, 'geo') |
|
| 2105 |
+ if elmGeo: |
|
| 2106 |
+ sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) |
|
| 2107 |
+ sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) |
|
| 2108 |
+ arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
|
|
| 2109 |
+ |
|
| 2110 |
+ # TITLE |
|
| 2111 |
+ processSingleString('title')
|
|
| 2112 |
+ |
|
| 2113 |
+ # ROLE |
|
| 2114 |
+ processSingleString('role')
|
|
| 2115 |
+ |
|
| 2116 |
+ # LOGO |
|
| 2117 |
+ processSingleURI('logo')
|
|
| 2118 |
+ |
|
| 2119 |
+ # ORG (organization) |
|
| 2120 |
+ elmOrg = self.getPropertyValue(elmCard, 'org') |
|
| 2121 |
+ if elmOrg: |
|
| 2122 |
+ sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) |
|
| 2123 |
+ if not sOrganizationName: |
|
| 2124 |
+ # implied "organization-name" optimization |
|
| 2125 |
+ # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization |
|
| 2126 |
+ sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) |
|
| 2127 |
+ if sOrganizationName: |
|
| 2128 |
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName))
|
|
| 2129 |
+ else: |
|
| 2130 |
+ arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) |
|
| 2131 |
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
|
|
| 2132 |
+ |
|
| 2133 |
+ # CATEGORY |
|
| 2134 |
+ arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) |
|
| 2135 |
+ if arCategory: |
|
| 2136 |
+ arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
|
|
| 2137 |
+ |
|
| 2138 |
+ # NOTE |
|
| 2139 |
+ processSingleString('note')
|
|
| 2140 |
+ |
|
| 2141 |
+ # REV |
|
| 2142 |
+ processSingleString('rev')
|
|
| 2143 |
+ |
|
| 2144 |
+ # SOUND |
|
| 2145 |
+ processSingleURI('sound')
|
|
| 2146 |
+ |
|
| 2147 |
+ # UID |
|
| 2148 |
+ processSingleString('uid')
|
|
| 2149 |
+ |
|
| 2150 |
+ # URL |
|
| 2151 |
+ processSingleURI('url')
|
|
| 2152 |
+ |
|
| 2153 |
+ # CLASS |
|
| 2154 |
+ processSingleString('class')
|
|
| 2155 |
+ |
|
| 2156 |
+ # KEY |
|
| 2157 |
+ processSingleURI('key')
|
|
| 2158 |
+ |
|
| 2159 |
+ if arLines: |
|
| 2160 |
+ arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard'] |
|
| 2161 |
+ sVCards += '\n'.join(arLines) + '\n' |
|
| 2162 |
+ |
|
| 2163 |
+ return sVCards.strip() |
|
| 2164 |
+ |
|
| 2165 |
+ def isProbablyDownloadable(self, elm): |
|
| 2166 |
+ attrsD = elm.attrMap |
|
| 2167 |
+ if not attrsD.has_key('href'): return 0
|
|
| 2168 |
+ linktype = attrsD.get('type', '').strip()
|
|
| 2169 |
+ if linktype.startswith('audio/') or \
|
|
| 2170 |
+ linktype.startswith('video/') or \
|
|
| 2171 |
+ (linktype.startswith('application/') and not linktype.endswith('xml')):
|
|
| 2172 |
+ return 1 |
|
| 2173 |
+ path = urlparse.urlparse(attrsD['href'])[2] |
|
| 2174 |
+ if path.find('.') == -1: return 0
|
|
| 2175 |
+ fileext = path.split('.').pop().lower()
|
|
| 2176 |
+ return fileext in self.known_binary_extensions |
|
| 2177 |
+ |
|
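isProbablyDownloadable above flags a link as an enclosure candidate when its type attribute is audio/*, video/*, or non-XML application/*, and otherwise falls back to the file extension; for example (hypothetical hrefs):

    #   href="show/episode.mp3"                 -> candidate ('mp3' is a known binary extension)
    #   href="notes.html"                       -> not a candidate
    #   href="feed" type="application/rss+xml"  -> not a candidate (XML type, no extension)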
| 2178 |
+ def findTags(self): |
|
| 2179 |
+ all = lambda x: 1 |
|
| 2180 |
+ for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
|
|
| 2181 |
+ href = elm.get('href')
|
|
| 2182 |
+ if not href: continue |
|
| 2183 |
+ urlscheme, domain, path, params, query, fragment = \ |
|
| 2184 |
+ urlparse.urlparse(_urljoin(self.baseuri, href)) |
|
| 2185 |
+ segments = path.split('/')
|
|
| 2186 |
+ tag = segments.pop() |
|
| 2187 |
+ if not tag: |
|
| 2188 |
+ tag = segments.pop() |
|
| 2189 |
+ tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', '')) |
|
| 2190 |
+ if not tagscheme.endswith('/'):
|
|
| 2191 |
+ tagscheme += '/' |
|
| 2192 |
+ self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
|
|
| 2193 |
+ |
|
| 2194 |
+ def findEnclosures(self): |
|
| 2195 |
+ all = lambda x: 1 |
|
| 2196 |
+ enclosure_match = re.compile(r'\benclosure\b') |
|
| 2197 |
+ for elm in self.document(all, {'href': re.compile(r'.+')}):
|
|
| 2198 |
+ if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
|
|
| 2199 |
+ if elm.attrMap not in self.enclosures: |
|
| 2200 |
+ self.enclosures.append(elm.attrMap) |
|
| 2201 |
+ if elm.string and not elm.get('title'):
|
|
| 2202 |
+ self.enclosures[-1]['title'] = elm.string |
|
| 2203 |
+ |
|
| 2204 |
+ def findXFN(self): |
|
| 2205 |
+ all = lambda x: 1 |
|
| 2206 |
+ for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
|
|
| 2207 |
+ rels = elm.get('rel', '').split()
|
|
| 2208 |
+ xfn_rels = [] |
|
| 2209 |
+ for rel in rels: |
|
| 2210 |
+ if rel in self.known_xfn_relationships: |
|
| 2211 |
+ xfn_rels.append(rel) |
|
| 2212 |
+ if xfn_rels: |
|
| 2213 |
+ self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
|
|
| 2214 |
+ |
|
| 2215 |
+def _parseMicroformats(htmlSource, baseURI, encoding): |
|
| 2216 |
+ if not BeautifulSoup: return |
|
| 2217 |
+ if _debug: sys.stderr.write('entering _parseMicroformats\n')
|
|
| 2218 |
+ p = _MicroformatsParser(htmlSource, baseURI, encoding) |
|
| 2219 |
+ p.vcard = p.findVCards(p.document) |
|
| 2220 |
+ p.findTags() |
|
| 2221 |
+ p.findEnclosures() |
|
| 2222 |
+ p.findXFN() |
|
| 2223 |
+ return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
|
|
| 2224 |
+ |
|
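_parseMicroformats above only runs when BeautifulSoup imported successfully earlier in the file; when it does, a rel="tag" link comes back as a parsed tag. A minimal sketch:

    html = '<a rel="tag" href="http://example.com/tags/python">python</a>'
    result = _parseMicroformats(html, 'http://example.com/', 'utf-8')
    # result['tags'][0] -> {'term': 'python',
    #                       'scheme': 'http://example.com/tags/',
    #                       'label': 'python'}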
| 2225 |
+class _RelativeURIResolver(_BaseHTMLProcessor): |
|
| 2226 |
+ relative_uris = [('a', 'href'),
|
|
| 2227 |
+ ('applet', 'codebase'),
|
|
| 2228 |
+ ('area', 'href'),
|
|
| 2229 |
+ ('blockquote', 'cite'),
|
|
| 2230 |
+ ('body', 'background'),
|
|
| 2231 |
+ ('del', 'cite'),
|
|
| 2232 |
+ ('form', 'action'),
|
|
| 2233 |
+ ('frame', 'longdesc'),
|
|
| 2234 |
+ ('frame', 'src'),
|
|
| 2235 |
+ ('iframe', 'longdesc'),
|
|
| 2236 |
+ ('iframe', 'src'),
|
|
| 2237 |
+ ('head', 'profile'),
|
|
| 2238 |
+ ('img', 'longdesc'),
|
|
| 2239 |
+ ('img', 'src'),
|
|
| 2240 |
+ ('img', 'usemap'),
|
|
| 2241 |
+ ('input', 'src'),
|
|
| 2242 |
+ ('input', 'usemap'),
|
|
| 2243 |
+ ('ins', 'cite'),
|
|
| 2244 |
+ ('link', 'href'),
|
|
| 2245 |
+ ('object', 'classid'),
|
|
| 2246 |
+ ('object', 'codebase'),
|
|
| 2247 |
+ ('object', 'data'),
|
|
| 2248 |
+ ('object', 'usemap'),
|
|
| 2249 |
+ ('q', 'cite'),
|
|
| 2250 |
+ ('script', 'src')]
|
|
| 2251 |
+ |
|
| 2252 |
+ def __init__(self, baseuri, encoding, type): |
|
| 2253 |
+ _BaseHTMLProcessor.__init__(self, encoding, type) |
|
| 2254 |
+ self.baseuri = baseuri |
|
| 2255 |
+ |
|
| 2256 |
+ def resolveURI(self, uri): |
|
| 2257 |
+ return _urljoin(self.baseuri, uri.strip()) |
|
| 2258 |
+ |
|
| 2259 |
+ def unknown_starttag(self, tag, attrs): |
|
| 2260 |
+ attrs = self.normalize_attrs(attrs) |
|
| 2261 |
+ attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] |
|
| 2262 |
+ _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) |
|
| 2263 |
+ |
|
| 2264 |
+def _resolveRelativeURIs(htmlSource, baseURI, encoding, type): |
|
| 2265 |
+ if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
|
|
| 2266 |
+ p = _RelativeURIResolver(baseURI, encoding, type) |
|
| 2267 |
+ p.feed(htmlSource) |
|
| 2268 |
+ return p.output() |
|
| 2269 |
+ |
|
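_resolveRelativeURIs above rewrites the (tag, attribute) pairs listed in relative_uris against the feed's base URI; a minimal sketch (attribute order and whitespace in the output may differ slightly):

    html = '<a href="/about">about</a><img src="img/logo.png">'
    print(_resolveRelativeURIs(html, 'http://example.com/blog/', 'utf-8', 'text/html'))
    # -> <a href="http://example.com/about">about</a>
    #    <img src="http://example.com/blog/img/logo.png" />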
| 2270 |
+class _HTMLSanitizer(_BaseHTMLProcessor): |
|
| 2271 |
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', |
|
| 2272 |
+ 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', |
|
| 2273 |
+ 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', |
|
| 2274 |
+ 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', |
|
| 2275 |
+ 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', |
|
| 2276 |
+ 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', |
|
| 2277 |
+ 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', |
|
| 2278 |
+ 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', |
|
| 2279 |
+ 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', |
|
| 2280 |
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', |
|
| 2281 |
+ 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', |
|
| 2282 |
+ 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] |
|
| 2283 |
+ |
|
| 2284 |
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', |
|
| 2285 |
+ 'action', 'align', 'alt', 'autoplay', 'autocomplete', 'autofocus', 'axis', |
|
| 2286 |
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border', |
|
| 2287 |
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', |
|
| 2288 |
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', |
|
| 2289 |
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', |
|
| 2290 |
+ 'colspan', 'compact', 'contenteditable', 'coords', 'data', 'datafld', |
|
| 2291 |
+ 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', |
|
| 2292 |
+ 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', |
|
| 2293 |
+ 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', |
|
| 2294 |
+ 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', |
|
| 2295 |
+ 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', |
|
| 2296 |
+ 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', |
|
| 2297 |
+ 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', |
|
| 2298 |
+ 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', |
|
| 2299 |
+ 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', |
|
| 2300 |
+ 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', |
|
| 2301 |
+ 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', |
|
| 2302 |
+ 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', |
|
| 2303 |
+ 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', |
|
| 2304 |
+ 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', |
|
| 2305 |
+ 'xml:lang'] |
|
| 2306 |
+ |
|
| 2307 |
+ unacceptable_elements_with_end_tag = ['script', 'applet', 'style'] |
|
| 2308 |
+ |
|
| 2309 |
+ acceptable_css_properties = ['azimuth', 'background-color', |
|
| 2310 |
+ 'border-bottom-color', 'border-collapse', 'border-color', |
|
| 2311 |
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear', |
|
| 2312 |
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', |
|
| 2313 |
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', |
|
| 2314 |
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', |
|
| 2315 |
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', |
|
| 2316 |
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', |
|
| 2317 |
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', |
|
| 2318 |
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', |
|
| 2319 |
+ 'white-space', 'width'] |
|
| 2320 |
+ |
|
| 2321 |
+ # survey of common keywords found in feeds |
|
| 2322 |
+ acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', |
|
| 2323 |
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', |
|
| 2324 |
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', |
|
| 2325 |
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', |
|
| 2326 |
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', |
|
| 2327 |
+ 'transparent', 'underline', 'white', 'yellow'] |
|
| 2328 |
+ |
|
| 2329 |
+ valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
|
|
| 2330 |
+ '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
|
|
| 2331 |
+ |
|
| 2332 |
+ mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math', |
|
| 2333 |
+ 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', |
|
| 2334 |
+ 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', |
|
| 2335 |
+ 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', |
|
| 2336 |
+ 'munderover', 'none', 'semantics'] |
|
| 2337 |
+ |
|
| 2338 |
+ mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', |
|
| 2339 |
+ 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', |
|
| 2340 |
+ 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', |
|
| 2341 |
+ 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', |
|
| 2342 |
+ 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', |
|
| 2343 |
+ 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', |
|
| 2344 |
+ 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', |
|
| 2345 |
+ 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', |
|
| 2346 |
+ 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'] |
|
| 2347 |
+ |
|
| 2348 |
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop |
|
| 2349 |
+ svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', |
|
| 2350 |
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', |
|
| 2351 |
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', |
|
| 2352 |
+ 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', |
|
| 2353 |
+ 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', |
|
| 2354 |
+ 'svg', 'switch', 'text', 'title', 'tspan', 'use'] |
|
| 2355 |
+ |
|
| 2356 |
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink |
|
| 2357 |
+ svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', |
|
| 2358 |
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType', |
|
| 2359 |
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', |
|
| 2360 |
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', |
|
| 2361 |
+ 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', |
|
| 2362 |
+ 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', |
|
| 2363 |
+ 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', |
|
| 2364 |
+ 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', |
|
| 2365 |
+ 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', |
|
| 2366 |
+ 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', |
|
| 2367 |
+ 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', |
|
| 2368 |
+ 'min', 'name', 'offset', 'opacity', 'orient', 'origin', |
|
| 2369 |
+ 'overline-position', 'overline-thickness', 'panose-1', 'path', |
|
| 2370 |
+ 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', |
|
| 2371 |
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', |
|
| 2372 |
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', |
|
| 2373 |
+ 'stop-color', 'stop-opacity', 'strikethrough-position', |
|
| 2374 |
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray', |
|
| 2375 |
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', |
|
| 2376 |
+ 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', |
|
| 2377 |
+ 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', |
|
| 2378 |
+ 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', |
|
| 2379 |
+ 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', |
|
| 2380 |
+ 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', |
|
| 2381 |
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', |
|
| 2382 |
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', |
|
| 2383 |
+ 'y2', 'zoomAndPan'] |
|
| 2384 |
+ |
|
| 2385 |
+ svg_attr_map = None |
|
| 2386 |
+ svg_elem_map = None |
|
| 2387 |
+ |
|
| 2388 |
+ acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', |
|
| 2389 |
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', |
|
| 2390 |
+ 'stroke-opacity'] |
|
| 2391 |
+ |
|
| 2392 |
+ def reset(self): |
|
| 2393 |
+ _BaseHTMLProcessor.reset(self) |
|
| 2394 |
+ self.unacceptablestack = 0 |
|
| 2395 |
+ self.mathmlOK = 0 |
|
| 2396 |
+ self.svgOK = 0 |
|
| 2397 |
+ |
|
| 2398 |
+ def unknown_starttag(self, tag, attrs): |
|
| 2399 |
+ acceptable_attributes = self.acceptable_attributes |
|
| 2400 |
+ keymap = {}
|
|
| 2401 |
+ if not tag in self.acceptable_elements or self.svgOK: |
|
| 2402 |
+ if tag in self.unacceptable_elements_with_end_tag: |
|
| 2403 |
+ self.unacceptablestack += 1 |
|
| 2404 |
+ |
|
| 2405 |
+ # not otherwise acceptable, perhaps it is MathML or SVG? |
|
| 2406 |
+ if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
|
|
| 2407 |
+ self.mathmlOK += 1 |
|
| 2408 |
+ if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
|
|
| 2409 |
+ self.svgOK += 1 |
|
| 2410 |
+ |
|
| 2411 |
+ # choose acceptable attributes based on tag class, else bail |
|
| 2412 |
+ if self.mathmlOK and tag in self.mathml_elements: |
|
| 2413 |
+ acceptable_attributes = self.mathml_attributes |
|
| 2414 |
+ elif self.svgOK and tag in self.svg_elements: |
|
| 2415 |
+ # for most vocabularies, lowercasing is a good idea. Many |
|
| 2416 |
+ # svg elements, however, are camel case |
|
| 2417 |
+ if not self.svg_attr_map: |
|
| 2418 |
+ lower=[attr.lower() for attr in self.svg_attributes] |
|
| 2419 |
+ mix=[a for a in self.svg_attributes if a not in lower] |
|
| 2420 |
+ self.svg_attributes = lower |
|
| 2421 |
+ self.svg_attr_map = dict([(a.lower(),a) for a in mix]) |
|
| 2422 |
+ |
|
| 2423 |
+ lower=[attr.lower() for attr in self.svg_elements] |
|
| 2424 |
+ mix=[a for a in self.svg_elements if a not in lower] |
|
| 2425 |
+ self.svg_elements = lower |
|
| 2426 |
+ self.svg_elem_map = dict([(a.lower(),a) for a in mix]) |
|
| 2427 |
+ acceptable_attributes = self.svg_attributes |
|
| 2428 |
+ tag = self.svg_elem_map.get(tag,tag) |
|
| 2429 |
+ keymap = self.svg_attr_map |
|
| 2430 |
+ elif not tag in self.acceptable_elements: |
|
| 2431 |
+ return |
|
| 2432 |
+ |
|
| 2433 |
+ # declare xlink namespace, if needed |
|
| 2434 |
+ if self.mathmlOK or self.svgOK: |
|
| 2435 |
+ if filter(lambda (n,v): n.startswith('xlink:'),attrs):
|
|
| 2436 |
+ if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
|
|
| 2437 |
+ attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
|
|
| 2438 |
+ |
|
| 2439 |
+ clean_attrs = [] |
|
| 2440 |
+ for key, value in self.normalize_attrs(attrs): |
|
| 2441 |
+ if key in acceptable_attributes: |
|
| 2442 |
+ key=keymap.get(key,key) |
|
| 2443 |
+ clean_attrs.append((key,value)) |
|
| 2444 |
+ elif key=='style': |
|
| 2445 |
+ clean_value = self.sanitize_style(value) |
|
| 2446 |
+ if clean_value: clean_attrs.append((key,clean_value)) |
|
| 2447 |
+ _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) |
|
| 2448 |
+ |
|
| 2449 |
+ def unknown_endtag(self, tag): |
|
| 2450 |
+ if not tag in self.acceptable_elements: |
|
| 2451 |
+ if tag in self.unacceptable_elements_with_end_tag: |
|
| 2452 |
+ self.unacceptablestack -= 1 |
|
| 2453 |
+ if self.mathmlOK and tag in self.mathml_elements: |
|
| 2454 |
+ if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1 |
|
| 2455 |
+ elif self.svgOK and tag in self.svg_elements: |
|
| 2456 |
+ tag = self.svg_elem_map.get(tag,tag) |
|
| 2457 |
+ if tag == 'svg' and self.svgOK: self.svgOK -= 1 |
|
| 2458 |
+ else: |
|
| 2459 |
+ return |
|
| 2460 |
+ _BaseHTMLProcessor.unknown_endtag(self, tag) |
|
| 2461 |
+ |
|
| 2462 |
+ def handle_pi(self, text): |
|
| 2463 |
+ pass |
|
| 2464 |
+ |
|
| 2465 |
+ def handle_decl(self, text): |
|
| 2466 |
+ pass |
|
| 2467 |
+ |
|
| 2468 |
+ def handle_data(self, text): |
|
| 2469 |
+ if not self.unacceptablestack: |
|
| 2470 |
+ _BaseHTMLProcessor.handle_data(self, text) |
|
| 2471 |
+ |
|
| 2472 |
+ def sanitize_style(self, style): |
|
| 2473 |
+ # disallow urls |
|
| 2474 |
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
|
|
| 2475 |
+ |
|
| 2476 |
+ # gauntlet |
|
| 2477 |
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
|
| 2478 |
+ if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
|
|
| 2479 |
+ |
|
| 2480 |
+ clean = [] |
|
| 2481 |
+ for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
|
|
| 2482 |
+ if not value: continue |
|
| 2483 |
+ if prop.lower() in self.acceptable_css_properties: |
|
| 2484 |
+ clean.append(prop + ': ' + value + ';') |
|
| 2485 |
+ elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
|
|
| 2486 |
+ for keyword in value.split(): |
|
| 2487 |
+ if not keyword in self.acceptable_css_keywords and \ |
|
| 2488 |
+ not self.valid_css_values.match(keyword): |
|
| 2489 |
+ break |
|
| 2490 |
+ else: |
|
| 2491 |
+ clean.append(prop + ': ' + value + ';') |
|
| 2492 |
+ elif self.svgOK and prop.lower() in self.acceptable_svg_properties: |
|
| 2493 |
+ clean.append(prop + ': ' + value + ';') |
|
| 2494 |
+ |
|
| 2495 |
+ return ' '.join(clean) |
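A minimal sketch of how sanitize_style() behaves; it assumes the constructor signature matches the _HTMLSanitizer(encoding, type) call used in _sanitizeHTML below, and the sample strings and expected outputs are illustrative only:

    s = _HTMLSanitizer('utf-8', 'text/html')
    print s.sanitize_style('color: red; position: fixed')
    # -> 'color: red;'  ('position' is not an acceptable CSS property)
    print s.sanitize_style("background: url('javascript:alert(1)')")
    # -> ''  (the url(...) call is stripped and the leftovers fail the gauntlet)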
|
| 2496 |
+ |
|
| 2497 |
+ |
|
| 2498 |
+def _sanitizeHTML(htmlSource, encoding, type): |
|
| 2499 |
+ p = _HTMLSanitizer(encoding, type) |
|
| 2500 |
+ p.feed(htmlSource) |
|
| 2501 |
+ data = p.output() |
|
| 2502 |
+ if TIDY_MARKUP: |
|
| 2503 |
+ # loop through list of preferred Tidy interfaces looking for one that's installed, |
|
| 2504 |
+ # then set up a common _tidy function to wrap the interface-specific API. |
|
| 2505 |
+ _tidy = None |
|
| 2506 |
+ for tidy_interface in PREFERRED_TIDY_INTERFACES: |
|
| 2507 |
+ try: |
|
| 2508 |
+ if tidy_interface == "uTidy": |
|
| 2509 |
+ from tidy import parseString as _utidy |
|
| 2510 |
+ def _tidy(data, **kwargs): |
|
| 2511 |
+ return str(_utidy(data, **kwargs)) |
|
| 2512 |
+ break |
|
| 2513 |
+ elif tidy_interface == "mxTidy": |
|
| 2514 |
+ from mx.Tidy import Tidy as _mxtidy |
|
| 2515 |
+ def _tidy(data, **kwargs): |
|
| 2516 |
+ nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) |
|
| 2517 |
+ return data |
|
| 2518 |
+ break |
|
| 2519 |
+ except: |
|
| 2520 |
+ pass |
|
| 2521 |
+ if _tidy: |
|
| 2522 |
+ utf8 = isinstance(data, unicode)  # builtin type() is shadowed by this function's 'type' argument |
|
| 2523 |
+ if utf8: |
|
| 2524 |
+ data = data.encode('utf-8')
|
|
| 2525 |
+ data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") |
|
| 2526 |
+ if utf8: |
|
| 2527 |
+ data = unicode(data, 'utf-8') |
|
| 2528 |
+ if data.count('<body'):
|
|
| 2529 |
+ data = data.split('<body', 1)[1]
|
|
| 2530 |
+ if data.count('>'):
|
|
| 2531 |
+ data = data.split('>', 1)[1]
|
|
| 2532 |
+ if data.count('</body'):
|
|
| 2533 |
+ data = data.split('</body', 1)[0]
|
|
| 2534 |
+ data = data.strip().replace('\r\n', '\n')
|
|
| 2535 |
+ return data |
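A hedged usage sketch of the module-private helper above; the sample markup assumes 'p' and 'em' appear in the acceptable_elements whitelist defined earlier in the file, and the expected output is approximate:

    markup = '<p onclick="evil()">hi <script>alert(1)</script><em>there</em></p>'
    print _sanitizeHTML(markup, 'utf-8', 'text/html')
    # -> roughly '<p>hi <em>there</em></p>': the onclick attribute is dropped and
    #    the script element and its contents are suppressed entirely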
|
| 2536 |
+ |
|
| 2537 |
+class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): |
|
| 2538 |
+ def http_error_default(self, req, fp, code, msg, headers): |
|
| 2539 |
+ if ((code / 100) == 3) and (code != 304): |
|
| 2540 |
+ return self.http_error_302(req, fp, code, msg, headers) |
|
| 2541 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
| 2542 |
+ infourl.status = code |
|
| 2543 |
+ return infourl |
|
| 2544 |
+ |
|
| 2545 |
+ def http_error_302(self, req, fp, code, msg, headers): |
|
| 2546 |
+ if headers.dict.has_key('location'):
|
|
| 2547 |
+ infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) |
|
| 2548 |
+ else: |
|
| 2549 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
| 2550 |
+ if not hasattr(infourl, 'status'): |
|
| 2551 |
+ infourl.status = code |
|
| 2552 |
+ return infourl |
|
| 2553 |
+ |
|
| 2554 |
+ def http_error_301(self, req, fp, code, msg, headers): |
|
| 2555 |
+ if headers.dict.has_key('location'):
|
|
| 2556 |
+ infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) |
|
| 2557 |
+ else: |
|
| 2558 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
| 2559 |
+ if not hasattr(infourl, 'status'): |
|
| 2560 |
+ infourl.status = code |
|
| 2561 |
+ return infourl |
|
| 2562 |
+ |
|
| 2563 |
+ http_error_300 = http_error_302 |
|
| 2564 |
+ http_error_303 = http_error_302 |
|
| 2565 |
+ http_error_307 = http_error_302 |
|
| 2566 |
+ |
|
| 2567 |
+ def http_error_401(self, req, fp, code, msg, headers): |
|
| 2568 |
+ # Check if |
|
| 2569 |
+ # - server requires digest auth, AND |
|
| 2570 |
+ # - we tried (unsuccessfully) with basic auth, AND |
|
| 2571 |
+ # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions) |
|
| 2572 |
+ # If all conditions hold, parse authentication information |
|
| 2573 |
+ # out of the Authorization header we sent the first time |
|
| 2574 |
+ # (for the username and password) and the WWW-Authenticate |
|
| 2575 |
+ # header the server sent back (for the realm) and retry |
|
| 2576 |
+ # the request with the appropriate digest auth headers instead. |
|
| 2577 |
+ # This evil genius hack has been brought to you by Aaron Swartz. |
|
| 2578 |
+ host = urlparse.urlparse(req.get_full_url())[1] |
|
| 2579 |
+ try: |
|
| 2580 |
+ assert sys.version.split()[0] >= '2.3.3' |
|
| 2581 |
+ assert base64 != None |
|
| 2582 |
+ user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
|
|
| 2583 |
+ realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
|
|
| 2584 |
+ self.add_password(realm, host, user, passw) |
|
| 2585 |
+ retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
|
|
| 2586 |
+ self.reset_retry_count() |
|
| 2587 |
+ return retry |
|
| 2588 |
+ except: |
|
| 2589 |
+ return self.http_error_default(req, fp, code, msg, headers) |
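An illustrative sketch (the URL is made up) of how this handler is meant to be used: installed into a urllib2 opener so that redirects and HTTP errors still come back as response objects carrying a .status attribute instead of raising:

    import urllib2
    opener = urllib2.build_opener(_FeedURLHandler())
    response = opener.open(urllib2.Request('http://example.org/feed.xml'))
    print getattr(response, 'status', 200), response.geturl()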
|
| 2590 |
+ |
|
| 2591 |
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): |
|
| 2592 |
+ """URL, filename, or string --> stream |
|
| 2593 |
+ |
|
| 2594 |
+ This function lets you define parsers that take any input source |
|
| 2595 |
+ (URL, pathname to local or network file, or actual data as a string) |
|
| 2596 |
+ and deal with it in a uniform manner. Returned object is guaranteed |
|
| 2597 |
+ to have all the basic stdio read methods (read, readline, readlines). |
|
| 2598 |
+ Just .close() the object when you're done with it. |
|
| 2599 |
+ |
|
| 2600 |
+ If the etag argument is supplied, it will be used as the value of an |
|
| 2601 |
+ If-None-Match request header. |
|
| 2602 |
+ |
|
| 2603 |
+ If the modified argument is supplied, it can be a tuple of 9 integers |
|
| 2604 |
+ (as returned by gmtime() in the standard Python time module) or a date |
|
| 2605 |
+ string in any format supported by feedparser. Regardless, it MUST |
|
| 2606 |
+ be in GMT (Greenwich Mean Time). It will be reformatted into an |
|
| 2607 |
+ RFC 1123-compliant date and used as the value of an If-Modified-Since |
|
| 2608 |
+ request header. |
|
| 2609 |
+ |
|
| 2610 |
+ If the agent argument is supplied, it will be used as the value of a |
|
| 2611 |
+ User-Agent request header. |
|
| 2612 |
+ |
|
| 2613 |
+ If the referrer argument is supplied, it will be used as the value of a |
|
| 2614 |
+ Referer[sic] request header. |
|
| 2615 |
+ |
|
| 2616 |
+ If handlers is supplied, it is a list of handlers used to build a |
|
| 2617 |
+ urllib2 opener. |
|
| 2618 |
+ """ |
|
| 2619 |
+ |
|
| 2620 |
+ if hasattr(url_file_stream_or_string, 'read'): |
|
| 2621 |
+ return url_file_stream_or_string |
|
| 2622 |
+ |
|
| 2623 |
+ if url_file_stream_or_string == '-': |
|
| 2624 |
+ return sys.stdin |
|
| 2625 |
+ |
|
| 2626 |
+ if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
|
|
| 2627 |
+ if not agent: |
|
| 2628 |
+ agent = USER_AGENT |
|
| 2629 |
+ # test for inline user:password for basic auth |
|
| 2630 |
+ auth = None |
|
| 2631 |
+ if base64: |
|
| 2632 |
+ urltype, rest = urllib.splittype(url_file_stream_or_string) |
|
| 2633 |
+ realhost, rest = urllib.splithost(rest) |
|
| 2634 |
+ if realhost: |
|
| 2635 |
+ user_passwd, realhost = urllib.splituser(realhost) |
|
| 2636 |
+ if user_passwd: |
|
| 2637 |
+ url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) |
|
| 2638 |
+ auth = base64.encodestring(user_passwd).strip() |
|
| 2639 |
+ |
|
| 2640 |
+ # iri support |
|
| 2641 |
+ try: |
|
| 2642 |
+ if isinstance(url_file_stream_or_string,unicode): |
|
| 2643 |
+ url_file_stream_or_string = url_file_stream_or_string.encode('idna')
|
|
| 2644 |
+ else: |
|
| 2645 |
+ url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna')
|
|
| 2646 |
+ except: |
|
| 2647 |
+ pass |
|
| 2648 |
+ |
|
| 2649 |
+ # try to open with urllib2 (to use optional headers) |
|
| 2650 |
+ request = urllib2.Request(url_file_stream_or_string) |
|
| 2651 |
+ request.add_header('User-Agent', agent)
|
|
| 2652 |
+ if etag: |
|
| 2653 |
+ request.add_header('If-None-Match', etag)
|
|
| 2654 |
+ if type(modified) == type(''):
|
|
| 2655 |
+ modified = _parse_date(modified) |
|
| 2656 |
+ if modified: |
|
| 2657 |
+ # format into an RFC 1123-compliant timestamp. We can't use |
|
| 2658 |
+ # time.strftime() since the %a and %b directives can be affected |
|
| 2659 |
+ # by the current locale, but RFC 2616 states that dates must be |
|
| 2660 |
+ # in English. |
|
| 2661 |
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] |
|
| 2662 |
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
| 2663 |
+ request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
|
|
| 2664 |
+ if referrer: |
|
| 2665 |
+ request.add_header('Referer', referrer)
|
|
| 2666 |
+ if gzip and zlib: |
|
| 2667 |
+ request.add_header('Accept-encoding', 'gzip, deflate')
|
|
| 2668 |
+ elif gzip: |
|
| 2669 |
+ request.add_header('Accept-encoding', 'gzip')
|
|
| 2670 |
+ elif zlib: |
|
| 2671 |
+ request.add_header('Accept-encoding', 'deflate')
|
|
| 2672 |
+ else: |
|
| 2673 |
+ request.add_header('Accept-encoding', '')
|
|
| 2674 |
+ if auth: |
|
| 2675 |
+ request.add_header('Authorization', 'Basic %s' % auth)
|
|
| 2676 |
+ if ACCEPT_HEADER: |
|
| 2677 |
+ request.add_header('Accept', ACCEPT_HEADER)
|
|
| 2678 |
+ request.add_header('A-IM', 'feed') # RFC 3229 support
|
|
| 2679 |
+ opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) |
|
| 2680 |
+ opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent |
|
| 2681 |
+ try: |
|
| 2682 |
+ return opener.open(request) |
|
| 2683 |
+ finally: |
|
| 2684 |
+ opener.close() # JohnD |
|
| 2685 |
+ |
|
| 2686 |
+ # try to open with native open function (if url_file_stream_or_string is a filename) |
|
| 2687 |
+ try: |
|
| 2688 |
+ return open(url_file_stream_or_string) |
|
| 2689 |
+ except: |
|
| 2690 |
+ pass |
|
| 2691 |
+ |
|
| 2692 |
+ # treat url_file_stream_or_string as string |
|
| 2693 |
+ return _StringIO(str(url_file_stream_or_string)) |
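A standalone worked example (not part of the original file) of the locale-independent If-Modified-Since formatting used above; it turns a gmtime() 9-tuple into an RFC 1123 timestamp without relying on time.strftime():

    import time
    modified = time.gmtime(1136214245)
    short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    print '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
        short_weekdays[modified[6]], modified[2], months[modified[1] - 1],
        modified[0], modified[3], modified[4], modified[5])
    # -> 'Mon, 02 Jan 2006 15:04:05 GMT'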
|
| 2694 |
+ |
|
| 2695 |
+_date_handlers = [] |
|
| 2696 |
+def registerDateHandler(func): |
|
| 2697 |
+ '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' |
|
| 2698 |
+ _date_handlers.insert(0, func) |
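A hedged example of registering a custom handler (the handler name and the feed format it targets are hypothetical): a handler takes a date string and returns a 9-tuple in GMT, or None if it cannot parse the string:

    import time
    def _parse_date_epoch(dateString):
        '''Handle feeds that put a raw Unix timestamp in their date fields.'''
        if not dateString.isdigit():
            return None
        return time.gmtime(int(dateString))
    registerDateHandler(_parse_date_epoch)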
|
| 2699 |
+ |
|
| 2700 |
+# ISO-8601 date parsing routines written by Fazal Majid. |
|
| 2701 |
+# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 |
|
| 2702 |
+# parser is beyond the scope of feedparser and would be a worthwhile addition |
|
| 2703 |
+# to the Python library. |
|
| 2704 |
+# A single regular expression cannot parse ISO 8601 date formats into groups |
|
| 2705 |
+# as the standard is highly irregular (for instance is 030104 2003-01-04 or |
|
| 2706 |
+# 0301-04-01), so we use templates instead. |
|
| 2707 |
+# Please note the order in templates is significant because we need a |
|
| 2708 |
+# greedy match. |
|
| 2709 |
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', |
|
| 2710 |
+ 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', |
|
| 2711 |
+ '-YY-?MM', '-OOO', '-YY', |
|
| 2712 |
+ '--MM-?DD', '--MM', |
|
| 2713 |
+ '---DD', |
|
| 2714 |
+ 'CC', ''] |
|
| 2715 |
+_iso8601_re = [ |
|
| 2716 |
+ tmpl.replace( |
|
| 2717 |
+ 'YYYY', r'(?P<year>\d{4})').replace(
|
|
| 2718 |
+ 'YY', r'(?P<year>\d\d)').replace( |
|
| 2719 |
+ 'MM', r'(?P<month>[01]\d)').replace( |
|
| 2720 |
+ 'DD', r'(?P<day>[0123]\d)').replace( |
|
| 2721 |
+ 'OOO', r'(?P<ordinal>[0123]\d\d)').replace( |
|
| 2722 |
+ 'CC', r'(?P<century>\d\d$)') |
|
| 2723 |
+ + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
|
|
| 2724 |
+ + r'(:(?P<second>\d{2}(\.\d*)?))?'
|
|
| 2725 |
+ + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
|
|
| 2726 |
+ for tmpl in _iso8601_tmpl] |
|
| 2727 |
+del tmpl |
|
| 2728 |
+_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] |
|
| 2729 |
+del regex |
|
| 2730 |
+def _parse_date_iso8601(dateString): |
|
| 2731 |
+ '''Parse a variety of ISO-8601-compatible formats like 20040105''' |
|
| 2732 |
+ m = None |
|
| 2733 |
+ for _iso8601_match in _iso8601_matches: |
|
| 2734 |
+ m = _iso8601_match(dateString) |
|
| 2735 |
+ if m: break |
|
| 2736 |
+ if not m: return |
|
| 2737 |
+ if m.span() == (0, 0): return |
|
| 2738 |
+ params = m.groupdict() |
|
| 2739 |
+ ordinal = params.get('ordinal', 0)
|
|
| 2740 |
+ if ordinal: |
|
| 2741 |
+ ordinal = int(ordinal) |
|
| 2742 |
+ else: |
|
| 2743 |
+ ordinal = 0 |
|
| 2744 |
+ year = params.get('year', '--')
|
|
| 2745 |
+ if not year or year == '--': |
|
| 2746 |
+ year = time.gmtime()[0] |
|
| 2747 |
+ elif len(year) == 2: |
|
| 2748 |
+ # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 |
|
| 2749 |
+ year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
| 2750 |
+ else: |
|
| 2751 |
+ year = int(year) |
|
| 2752 |
+ month = params.get('month', '-')
|
|
| 2753 |
+ if not month or month == '-': |
|
| 2754 |
+ # ordinals are NOT normalized by mktime, we simulate them |
|
| 2755 |
+ # by setting month=1, day=ordinal |
|
| 2756 |
+ if ordinal: |
|
| 2757 |
+ month = 1 |
|
| 2758 |
+ else: |
|
| 2759 |
+ month = time.gmtime()[1] |
|
| 2760 |
+ month = int(month) |
|
| 2761 |
+ day = params.get('day', 0)
|
|
| 2762 |
+ if not day: |
|
| 2763 |
+ # see above |
|
| 2764 |
+ if ordinal: |
|
| 2765 |
+ day = ordinal |
|
| 2766 |
+ elif params.get('century', 0) or \
|
|
| 2767 |
+ params.get('year', 0) or params.get('month', 0):
|
|
| 2768 |
+ day = 1 |
|
| 2769 |
+ else: |
|
| 2770 |
+ day = time.gmtime()[2] |
|
| 2771 |
+ else: |
|
| 2772 |
+ day = int(day) |
|
| 2773 |
+ # special case of the century - is the first year of the 21st century |
|
| 2774 |
+ # 2000 or 2001 ? The debate goes on... |
|
| 2775 |
+ if 'century' in params.keys(): |
|
| 2776 |
+ year = (int(params['century']) - 1) * 100 + 1 |
|
| 2777 |
+ # in ISO 8601 most fields are optional |
|
| 2778 |
+ for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: |
|
| 2779 |
+ if not params.get(field, None): |
|
| 2780 |
+ params[field] = 0 |
|
| 2781 |
+ hour = int(params.get('hour', 0))
|
|
| 2782 |
+ minute = int(params.get('minute', 0))
|
|
| 2783 |
+ second = int(float(params.get('second', 0)))
|
|
| 2784 |
+ # weekday is normalized by mktime(), we can ignore it |
|
| 2785 |
+ weekday = 0 |
|
| 2786 |
+ daylight_savings_flag = -1 |
|
| 2787 |
+ tm = [year, month, day, hour, minute, second, weekday, |
|
| 2788 |
+ ordinal, daylight_savings_flag] |
|
| 2789 |
+ # ISO 8601 time zone adjustments |
|
| 2790 |
+ tz = params.get('tz')
|
|
| 2791 |
+ if tz and tz != 'Z': |
|
| 2792 |
+ if tz[0] == '-': |
|
| 2793 |
+ tm[3] += int(params.get('tzhour', 0))
|
|
| 2794 |
+ tm[4] += int(params.get('tzmin', 0))
|
|
| 2795 |
+ elif tz[0] == '+': |
|
| 2796 |
+ tm[3] -= int(params.get('tzhour', 0))
|
|
| 2797 |
+ tm[4] -= int(params.get('tzmin', 0))
|
|
| 2798 |
+ else: |
|
| 2799 |
+ return None |
|
| 2800 |
+ # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) |
|
| 2801 |
+ # which is guaranteed to normalize d/m/y/h/m/s. |
|
| 2802 |
+ # Many implementations have bugs, but we'll pretend they don't. |
|
| 2803 |
+ return time.localtime(time.mktime(tm)) |
|
| 2804 |
+registerDateHandler(_parse_date_iso8601) |
|
| 2805 |
+ |
|
| 2806 |
+# 8-bit date handling routines written by ytrewq1. |
|
| 2807 |
+_korean_year = u'\ub144' # b3e2 in euc-kr |
|
| 2808 |
+_korean_month = u'\uc6d4' # bff9 in euc-kr |
|
| 2809 |
+_korean_day = u'\uc77c' # c0cf in euc-kr |
|
| 2810 |
+_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr |
|
| 2811 |
+_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr |
|
| 2812 |
+ |
|
| 2813 |
+_korean_onblog_date_re = \ |
|
| 2814 |
+ re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
|
|
| 2815 |
+ (_korean_year, _korean_month, _korean_day)) |
|
| 2816 |
+_korean_nate_date_re = \ |
|
| 2817 |
+ re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
|
|
| 2818 |
+ (_korean_am, _korean_pm)) |
|
| 2819 |
+def _parse_date_onblog(dateString): |
|
| 2820 |
+ '''Parse a string according to the OnBlog 8-bit date format''' |
|
| 2821 |
+ m = _korean_onblog_date_re.match(dateString) |
|
| 2822 |
+ if not m: return |
|
| 2823 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
| 2824 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
|
|
| 2825 |
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
| 2826 |
+ 'zonediff': '+09:00'} |
|
| 2827 |
+ if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
|
|
| 2828 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
| 2829 |
+registerDateHandler(_parse_date_onblog) |
|
| 2830 |
+ |
|
| 2831 |
+def _parse_date_nate(dateString): |
|
| 2832 |
+ '''Parse a string according to the Nate 8-bit date format''' |
|
| 2833 |
+ m = _korean_nate_date_re.match(dateString) |
|
| 2834 |
+ if not m: return |
|
| 2835 |
+ hour = int(m.group(5)) |
|
| 2836 |
+ ampm = m.group(4) |
|
| 2837 |
+ if (ampm == _korean_pm): |
|
| 2838 |
+ hour += 12 |
|
| 2839 |
+ hour = str(hour) |
|
| 2840 |
+ if len(hour) == 1: |
|
| 2841 |
+ hour = '0' + hour |
|
| 2842 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
| 2843 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
|
|
| 2844 |
+ 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ |
|
| 2845 |
+ 'zonediff': '+09:00'} |
|
| 2846 |
+ if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
|
|
| 2847 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
| 2848 |
+registerDateHandler(_parse_date_nate) |
|
| 2849 |
+ |
|
| 2850 |
+_mssql_date_re = \ |
|
| 2851 |
+ re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
|
|
| 2852 |
+def _parse_date_mssql(dateString): |
|
| 2853 |
+ '''Parse a string according to the MS SQL date format''' |
|
| 2854 |
+ m = _mssql_date_re.match(dateString) |
|
| 2855 |
+ if not m: return |
|
| 2856 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
| 2857 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
|
|
| 2858 |
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
| 2859 |
+ 'zonediff': '+09:00'} |
|
| 2860 |
+ if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
|
|
| 2861 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
| 2862 |
+registerDateHandler(_parse_date_mssql) |
|
| 2863 |
+ |
|
| 2864 |
+# Unicode strings for Greek date strings |
|
| 2865 |
+_greek_months = \ |
|
| 2866 |
+ { \
|
|
| 2867 |
+ u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 |
|
| 2868 |
+ u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 |
|
| 2869 |
+ u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 |
|
| 2870 |
+ u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 |
|
| 2871 |
+ u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 |
|
| 2872 |
+ u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 |
|
| 2873 |
+ u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 |
|
| 2874 |
+ u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 |
|
| 2875 |
+ u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 |
|
| 2876 |
+ u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 |
|
| 2877 |
+ u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 |
|
| 2878 |
+ u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 |
|
| 2879 |
+ u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 |
|
| 2880 |
+ u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 |
|
| 2881 |
+ u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 |
|
| 2882 |
+ u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 |
|
| 2883 |
+ u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 |
|
| 2884 |
+ u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 |
|
| 2885 |
+ u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 |
|
| 2886 |
+ } |
|
| 2887 |
+ |
|
| 2888 |
+_greek_wdays = \ |
|
| 2889 |
+ { \
|
|
| 2890 |
+ u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 |
|
| 2891 |
+ u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 |
|
| 2892 |
+ u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 |
|
| 2893 |
+ u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 |
|
| 2894 |
+ u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 |
|
| 2895 |
+ u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 |
|
| 2896 |
+ u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 |
|
| 2897 |
+ } |
|
| 2898 |
+ |
|
| 2899 |
+_greek_date_format_re = \ |
|
| 2900 |
+ re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
|
|
| 2901 |
+ |
|
| 2902 |
+def _parse_date_greek(dateString): |
|
| 2903 |
+ '''Parse a string according to a Greek 8-bit date format.''' |
|
| 2904 |
+ m = _greek_date_format_re.match(dateString) |
|
| 2905 |
+ if not m: return |
|
| 2906 |
+ try: |
|
| 2907 |
+ wday = _greek_wdays[m.group(1)] |
|
| 2908 |
+ month = _greek_months[m.group(3)] |
|
| 2909 |
+ except: |
|
| 2910 |
+ return |
|
| 2911 |
+ rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ |
|
| 2912 |
+ {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
|
|
| 2913 |
+ 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ |
|
| 2914 |
+ 'zonediff': m.group(8)} |
|
| 2915 |
+ if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
|
|
| 2916 |
+ return _parse_date_rfc822(rfc822date) |
|
| 2917 |
+registerDateHandler(_parse_date_greek) |
|
| 2918 |
+ |
|
| 2919 |
+# Unicode strings for Hungarian date strings |
|
| 2920 |
+_hungarian_months = \ |
|
| 2921 |
+ { \
|
|
| 2922 |
+ u'janu\u00e1r': u'01', # e1 in iso-8859-2 |
|
| 2923 |
+ u'febru\u00e1ri': u'02', # e1 in iso-8859-2 |
|
| 2924 |
+ u'm\u00e1rcius': u'03', # e1 in iso-8859-2 |
|
| 2925 |
+ u'\u00e1prilis': u'04', # e1 in iso-8859-2 |
|
| 2926 |
+ u'm\u00e1ujus': u'05', # e1 in iso-8859-2 |
|
| 2927 |
+ u'j\u00fanius': u'06', # fa in iso-8859-2 |
|
| 2928 |
+ u'j\u00falius': u'07', # fa in iso-8859-2 |
|
| 2929 |
+ u'augusztus': u'08', |
|
| 2930 |
+ u'szeptember': u'09', |
|
| 2931 |
+ u'okt\u00f3ber': u'10', # f3 in iso-8859-2 |
|
| 2932 |
+ u'november': u'11', |
|
| 2933 |
+ u'december': u'12', |
|
| 2934 |
+ } |
|
| 2935 |
+ |
|
| 2936 |
+_hungarian_date_format_re = \ |
|
| 2937 |
+ re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
|
|
| 2938 |
+ |
|
| 2939 |
+def _parse_date_hungarian(dateString): |
|
| 2940 |
+ '''Parse a string according to a Hungarian 8-bit date format.''' |
|
| 2941 |
+ m = _hungarian_date_format_re.match(dateString) |
|
| 2942 |
+ if not m: return |
|
| 2943 |
+ try: |
|
| 2944 |
+ month = _hungarian_months[m.group(2)] |
|
| 2945 |
+ day = m.group(3) |
|
| 2946 |
+ if len(day) == 1: |
|
| 2947 |
+ day = '0' + day |
|
| 2948 |
+ hour = m.group(4) |
|
| 2949 |
+ if len(hour) == 1: |
|
| 2950 |
+ hour = '0' + hour |
|
| 2951 |
+ except: |
|
| 2952 |
+ return |
|
| 2953 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ |
|
| 2954 |
+ {'year': m.group(1), 'month': month, 'day': day,\
|
|
| 2955 |
+ 'hour': hour, 'minute': m.group(5),\ |
|
| 2956 |
+ 'zonediff': m.group(6)} |
|
| 2957 |
+ if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
|
|
| 2958 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
| 2959 |
+registerDateHandler(_parse_date_hungarian) |
|
| 2960 |
+ |
|
| 2961 |
+# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by |
|
| 2962 |
+# Drake and licensed under the Python license. Removed all range checking |
|
| 2963 |
+# for month, day, hour, minute, and second, since mktime will normalize |
|
| 2964 |
+# these later |
|
| 2965 |
+def _parse_date_w3dtf(dateString): |
|
| 2966 |
+ def __extract_date(m): |
|
| 2967 |
+ year = int(m.group('year'))
|
|
| 2968 |
+ if year < 100: |
|
| 2969 |
+ year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
| 2970 |
+ if year < 1000: |
|
| 2971 |
+ return 0, 0, 0 |
|
| 2972 |
+ julian = m.group('julian')
|
|
| 2973 |
+ if julian: |
|
| 2974 |
+ julian = int(julian) |
|
| 2975 |
+ month = julian / 30 + 1 |
|
| 2976 |
+ day = julian % 30 + 1 |
|
| 2977 |
+ jday = None |
|
| 2978 |
+ while jday != julian: |
|
| 2979 |
+ t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) |
|
| 2980 |
+ jday = time.gmtime(t)[-2] |
|
| 2981 |
+ diff = abs(jday - julian) |
|
| 2982 |
+ if jday > julian: |
|
| 2983 |
+ if diff < day: |
|
| 2984 |
+ day = day - diff |
|
| 2985 |
+ else: |
|
| 2986 |
+ month = month - 1 |
|
| 2987 |
+ day = 31 |
|
| 2988 |
+ elif jday < julian: |
|
| 2989 |
+ if day + diff < 28: |
|
| 2990 |
+ day = day + diff |
|
| 2991 |
+ else: |
|
| 2992 |
+ month = month + 1 |
|
| 2993 |
+ return year, month, day |
|
| 2994 |
+ month = m.group('month')
|
|
| 2995 |
+ day = 1 |
|
| 2996 |
+ if month is None: |
|
| 2997 |
+ month = 1 |
|
| 2998 |
+ else: |
|
| 2999 |
+ month = int(month) |
|
| 3000 |
+ day = m.group('day')
|
|
| 3001 |
+ if day: |
|
| 3002 |
+ day = int(day) |
|
| 3003 |
+ else: |
|
| 3004 |
+ day = 1 |
|
| 3005 |
+ return year, month, day |
|
| 3006 |
+ |
|
| 3007 |
+ def __extract_time(m): |
|
| 3008 |
+ if not m: |
|
| 3009 |
+ return 0, 0, 0 |
|
| 3010 |
+ hours = m.group('hours')
|
|
| 3011 |
+ if not hours: |
|
| 3012 |
+ return 0, 0, 0 |
|
| 3013 |
+ hours = int(hours) |
|
| 3014 |
+ minutes = int(m.group('minutes'))
|
|
| 3015 |
+ seconds = m.group('seconds')
|
|
| 3016 |
+ if seconds: |
|
| 3017 |
+ seconds = int(seconds) |
|
| 3018 |
+ else: |
|
| 3019 |
+ seconds = 0 |
|
| 3020 |
+ return hours, minutes, seconds |
|
| 3021 |
+ |
|
| 3022 |
+ def __extract_tzd(m): |
|
| 3023 |
+ '''Return the Time Zone Designator as an offset in seconds from UTC.''' |
|
| 3024 |
+ if not m: |
|
| 3025 |
+ return 0 |
|
| 3026 |
+ tzd = m.group('tzd')
|
|
| 3027 |
+ if not tzd: |
|
| 3028 |
+ return 0 |
|
| 3029 |
+ if tzd == 'Z': |
|
| 3030 |
+ return 0 |
|
| 3031 |
+ hours = int(m.group('tzdhours'))
|
|
| 3032 |
+ minutes = m.group('tzdminutes')
|
|
| 3033 |
+ if minutes: |
|
| 3034 |
+ minutes = int(minutes) |
|
| 3035 |
+ else: |
|
| 3036 |
+ minutes = 0 |
|
| 3037 |
+ offset = (hours*60 + minutes) * 60 |
|
| 3038 |
+ if tzd[0] == '+': |
|
| 3039 |
+ return -offset |
|
| 3040 |
+ return offset |
|
| 3041 |
+ |
|
| 3042 |
+ __date_re = ('(?P<year>\d\d\d\d)'
|
|
| 3043 |
+ '(?:(?P<dsep>-|)' |
|
| 3044 |
+ '(?:(?P<julian>\d\d\d)' |
|
| 3045 |
+ '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?') |
|
| 3046 |
+ __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)' |
|
| 3047 |
+ __tzd_rx = re.compile(__tzd_re) |
|
| 3048 |
+ __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
|
|
| 3049 |
+ '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?' |
|
| 3050 |
+ + __tzd_re) |
|
| 3051 |
+ __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) |
|
| 3052 |
+ __datetime_rx = re.compile(__datetime_re) |
|
| 3053 |
+ m = __datetime_rx.match(dateString) |
|
| 3054 |
+ if (m is None) or (m.group() != dateString): return |
|
| 3055 |
+ gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) |
|
| 3056 |
+ if gmt[0] == 0: return |
|
| 3057 |
+ return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) |
|
| 3058 |
+registerDateHandler(_parse_date_w3dtf) |
|
| 3059 |
+ |
|
| 3060 |
+def _parse_date_rfc822(dateString): |
|
| 3061 |
+ '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' |
|
| 3062 |
+ data = dateString.split() |
|
| 3063 |
+ if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
|
|
| 3064 |
+ del data[0] |
|
| 3065 |
+ if len(data) == 4: |
|
| 3066 |
+ s = data[3] |
|
| 3067 |
+ i = s.find('+')
|
|
| 3068 |
+ if i > 0: |
|
| 3069 |
+ data[3:] = [s[:i], s[i+1:]] |
|
| 3070 |
+ else: |
|
| 3071 |
+ data.append('')
|
|
| 3072 |
+ dateString = " ".join(data) |
|
| 3073 |
+ if len(data) < 5: |
|
| 3074 |
+ dateString += ' 00:00:00 GMT' |
|
| 3075 |
+ tm = rfc822.parsedate_tz(dateString) |
|
| 3076 |
+ if tm: |
|
| 3077 |
+ return time.gmtime(rfc822.mktime_tz(tm)) |
|
| 3078 |
+# rfc822.py defines several time zones, but we define some extra ones. |
|
| 3079 |
+# 'ET' is equivalent to 'EST', etc. |
|
| 3080 |
+_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
|
|
| 3081 |
+rfc822._timezones.update(_additional_timezones) |
|
| 3082 |
+registerDateHandler(_parse_date_rfc822) |
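A quick illustrative check (expected values assume the standard library rfc822 zone table plus the extra zones added above):

    print _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')[:6]
    # -> (2004, 1, 1, 19, 48, 21)
    print _parse_date_rfc822('Thu, 01 Jan 2004 14:48:21 ET')[:6]
    # -> (2004, 1, 1, 19, 48, 21), since 'ET' is treated as -0500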
|
| 3083 |
+ |
|
| 3084 |
+def _parse_date_perforce(aDateString): |
|
| 3085 |
+ """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" |
|
| 3086 |
+ # Fri, 2006/09/15 08:19:53 EDT |
|
| 3087 |
+ _my_date_pattern = re.compile( \ |
|
| 3088 |
+ r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
|
|
| 3089 |
+ |
|
| 3090 |
+ dow, year, month, day, hour, minute, second, tz = \ |
|
| 3091 |
+ _my_date_pattern.search(aDateString).groups() |
|
| 3092 |
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
| 3093 |
+ dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) |
|
| 3094 |
+ tm = rfc822.parsedate_tz(dateString) |
|
| 3095 |
+ if tm: |
|
| 3096 |
+ return time.gmtime(rfc822.mktime_tz(tm)) |
|
| 3097 |
+registerDateHandler(_parse_date_perforce) |
|
| 3098 |
+ |
|
| 3099 |
+def _parse_date(dateString): |
|
| 3100 |
+ '''Parses a variety of date formats into a 9-tuple in GMT''' |
|
| 3101 |
+ for handler in _date_handlers: |
|
| 3102 |
+ try: |
|
| 3103 |
+ date9tuple = handler(dateString) |
|
| 3104 |
+ if not date9tuple: continue |
|
| 3105 |
+ if len(date9tuple) != 9: |
|
| 3106 |
+ if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
|
|
| 3107 |
+ raise ValueError |
|
| 3108 |
+ map(int, date9tuple) |
|
| 3109 |
+ return date9tuple |
|
| 3110 |
+ except Exception as e: |
|
| 3111 |
+ if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
|
|
| 3112 |
+ pass |
|
| 3113 |
+ return None |
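A hedged usage sketch of the dispatcher above: handlers are tried most-recently-registered first, and the first one that yields a valid 9-tuple wins:

    print _parse_date('Fri, 2006/09/15 08:19:53 EDT')[:6]
    # -> (2006, 9, 15, 12, 19, 53)  (the Perforce handler matches; EDT becomes GMT)
    print _parse_date('this is not a date')
    # -> None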
|
| 3114 |
+ |
|
| 3115 |
+def _getCharacterEncoding(http_headers, xml_data): |
|
| 3116 |
+ '''Get the character encoding of the XML document |
|
| 3117 |
+ |
|
| 3118 |
+ http_headers is a dictionary |
|
| 3119 |
+ xml_data is a raw string (not Unicode) |
|
| 3120 |
+ |
|
| 3121 |
+ This is so much trickier than it sounds, it's not even funny. |
|
| 3122 |
+ According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
|
|
| 3123 |
+ is application/xml, application/*+xml, |
|
| 3124 |
+ application/xml-external-parsed-entity, or application/xml-dtd, |
|
| 3125 |
+ the encoding given in the charset parameter of the HTTP Content-Type |
|
| 3126 |
+ takes precedence over the encoding given in the XML prefix within the |
|
| 3127 |
+ document, and defaults to 'utf-8' if neither are specified. But, if |
|
| 3128 |
+ the HTTP Content-Type is text/xml, text/*+xml, or |
|
| 3129 |
+ text/xml-external-parsed-entity, the encoding given in the XML prefix |
|
| 3130 |
+ within the document is ALWAYS IGNORED and only the encoding given in |
|
| 3131 |
+ the charset parameter of the HTTP Content-Type header should be |
|
| 3132 |
+ respected, and it defaults to 'us-ascii' if not specified. |
|
| 3133 |
+ |
|
| 3134 |
+ Furthermore, discussion on the atom-syntax mailing list with the |
|
| 3135 |
+ author of RFC 3023 leads me to the conclusion that any document |
|
| 3136 |
+ served with a Content-Type of text/* and no charset parameter |
|
| 3137 |
+ must be treated as us-ascii. (We now do this.) And also that it |
|
| 3138 |
+ must always be flagged as non-well-formed. (We now do this too.) |
|
| 3139 |
+ |
|
| 3140 |
+ If Content-Type is unspecified (input was local file or non-HTTP source) |
|
| 3141 |
+ or unrecognized (server just got it totally wrong), then go by the |
|
| 3142 |
+ encoding given in the XML prefix of the document and default to |
|
| 3143 |
+ 'iso-8859-1' as per the HTTP specification (RFC 2616). |
|
| 3144 |
+ |
|
| 3145 |
+ Then, assuming we didn't find a character encoding in the HTTP headers |
|
| 3146 |
+ (and the HTTP Content-type allowed us to look in the body), we need |
|
| 3147 |
+ to sniff the first few bytes of the XML data and try to determine |
|
| 3148 |
+ whether the encoding is ASCII-compatible. Section F of the XML |
|
| 3149 |
+ specification shows the way here: |
|
| 3150 |
+ http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
| 3151 |
+ |
|
| 3152 |
+ If the sniffed encoding is not ASCII-compatible, we need to make it |
|
| 3153 |
+ ASCII compatible so that we can sniff further into the XML declaration |
|
| 3154 |
+ to find the encoding attribute, which will tell us the true encoding. |
|
| 3155 |
+ |
|
| 3156 |
+ Of course, none of this guarantees that we will be able to parse the |
|
| 3157 |
+ feed in the declared character encoding (assuming it was declared |
|
| 3158 |
+ correctly, which many are not). CJKCodecs and iconv_codec help a lot; |
|
| 3159 |
+ you should definitely install them if you can. |
|
| 3160 |
+ http://cjkpython.i18n.org/ |
|
| 3161 |
+ ''' |
|
| 3162 |
+ |
|
| 3163 |
+ def _parseHTTPContentType(content_type): |
|
| 3164 |
+ '''takes HTTP Content-Type header and returns (content type, charset) |
|
| 3165 |
+ |
|
| 3166 |
+ If no charset is specified, returns (content type, '') |
|
| 3167 |
+ If no content type is specified, returns ('', '')
|
|
| 3168 |
+ Both return parameters are guaranteed to be lowercase strings |
|
| 3169 |
+ ''' |
|
| 3170 |
+ content_type = content_type or '' |
|
| 3171 |
+ content_type, params = cgi.parse_header(content_type) |
|
| 3172 |
+ return content_type, params.get('charset', '').replace("'", '')
|
|
| 3173 |
+ |
|
| 3174 |
+ sniffed_xml_encoding = '' |
|
| 3175 |
+ xml_encoding = '' |
|
| 3176 |
+ true_encoding = '' |
|
| 3177 |
+ http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
|
|
| 3178 |
+ # Must sniff for non-ASCII-compatible character encodings before |
|
| 3179 |
+ # searching for XML declaration. This heuristic is defined in |
|
| 3180 |
+ # section F of the XML specification: |
|
| 3181 |
+ # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
| 3182 |
+ try: |
|
| 3183 |
+ if xml_data[:4] == '\x4c\x6f\xa7\x94': |
|
| 3184 |
+ # EBCDIC |
|
| 3185 |
+ xml_data = _ebcdic_to_ascii(xml_data) |
|
| 3186 |
+ elif xml_data[:4] == '\x00\x3c\x00\x3f': |
|
| 3187 |
+ # UTF-16BE |
|
| 3188 |
+ sniffed_xml_encoding = 'utf-16be' |
|
| 3189 |
+ xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
|
|
| 3190 |
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): |
|
| 3191 |
+ # UTF-16BE with BOM |
|
| 3192 |
+ sniffed_xml_encoding = 'utf-16be' |
|
| 3193 |
+ xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
|
|
| 3194 |
+ elif xml_data[:4] == '\x3c\x00\x3f\x00': |
|
| 3195 |
+ # UTF-16LE |
|
| 3196 |
+ sniffed_xml_encoding = 'utf-16le' |
|
| 3197 |
+ xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
|
|
| 3198 |
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): |
|
| 3199 |
+ # UTF-16LE with BOM |
|
| 3200 |
+ sniffed_xml_encoding = 'utf-16le' |
|
| 3201 |
+ xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
|
|
| 3202 |
+ elif xml_data[:4] == '\x00\x00\x00\x3c': |
|
| 3203 |
+ # UTF-32BE |
|
| 3204 |
+ sniffed_xml_encoding = 'utf-32be' |
|
| 3205 |
+ xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
|
|
| 3206 |
+ elif xml_data[:4] == '\x3c\x00\x00\x00': |
|
| 3207 |
+ # UTF-32LE |
|
| 3208 |
+ sniffed_xml_encoding = 'utf-32le' |
|
| 3209 |
+ xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
|
|
| 3210 |
+ elif xml_data[:4] == '\x00\x00\xfe\xff': |
|
| 3211 |
+ # UTF-32BE with BOM |
|
| 3212 |
+ sniffed_xml_encoding = 'utf-32be' |
|
| 3213 |
+ xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
|
|
| 3214 |
+ elif xml_data[:4] == '\xff\xfe\x00\x00': |
|
| 3215 |
+ # UTF-32LE with BOM |
|
| 3216 |
+ sniffed_xml_encoding = 'utf-32le' |
|
| 3217 |
+ xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
|
|
| 3218 |
+ elif xml_data[:3] == '\xef\xbb\xbf': |
|
| 3219 |
+ # UTF-8 with BOM |
|
| 3220 |
+ sniffed_xml_encoding = 'utf-8' |
|
| 3221 |
+ xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
|
|
| 3222 |
+ else: |
|
| 3223 |
+ # ASCII-compatible |
|
| 3224 |
+ pass |
|
| 3225 |
+ xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
|
|
| 3226 |
+ except: |
|
| 3227 |
+ xml_encoding_match = None |
|
| 3228 |
+ if xml_encoding_match: |
|
| 3229 |
+ xml_encoding = xml_encoding_match.groups()[0].lower() |
|
| 3230 |
+ if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
|
|
| 3231 |
+ xml_encoding = sniffed_xml_encoding |
|
| 3232 |
+ acceptable_content_type = 0 |
|
| 3233 |
+ application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
|
|
| 3234 |
+ text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
|
|
| 3235 |
+ if (http_content_type in application_content_types) or \ |
|
| 3236 |
+ (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
|
|
| 3237 |
+ acceptable_content_type = 1 |
|
| 3238 |
+ true_encoding = http_encoding or xml_encoding or 'utf-8' |
|
| 3239 |
+ elif (http_content_type in text_content_types) or \ |
|
| 3240 |
+ (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
|
|
| 3241 |
+ acceptable_content_type = 1 |
|
| 3242 |
+ true_encoding = http_encoding or 'us-ascii' |
|
| 3243 |
+ elif http_content_type.startswith('text/'):
|
|
| 3244 |
+ true_encoding = http_encoding or 'us-ascii' |
|
| 3245 |
+ elif http_headers and (not http_headers.has_key('content-type')):
|
|
| 3246 |
+ true_encoding = xml_encoding or 'iso-8859-1' |
|
| 3247 |
+ else: |
|
| 3248 |
+ true_encoding = xml_encoding or 'utf-8' |
|
| 3249 |
+ # some feeds claim to be gb2312 but are actually gb18030. |
|
| 3250 |
+ # apparently MSIE and Firefox both do the following switch: |
|
| 3251 |
+ if true_encoding.lower() == 'gb2312': |
|
| 3252 |
+ true_encoding = 'gb18030' |
|
| 3253 |
+ return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type |
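An illustrative sketch of the RFC 3023 precedence rules implemented above (the header values are made up): for application/*+xml the HTTP charset wins over the XML declaration, while text/xml with no charset parameter falls back to us-ascii:

    data = '<?xml version="1.0" encoding="utf-8"?><feed/>'
    print _getCharacterEncoding(
        {'content-type': 'application/atom+xml; charset=iso-8859-1'}, data)[0]
    # -> 'iso-8859-1'
    print _getCharacterEncoding({'content-type': 'text/xml'}, data)[0]
    # -> 'us-ascii'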
|
| 3254 |
+ |
|
| 3255 |
+def _toUTF8(data, encoding): |
|
| 3256 |
+ '''Changes an XML data stream on the fly to specify a new encoding |
|
| 3257 |
+ |
|
| 3258 |
+ data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already |
|
| 3259 |
+ encoding is a string recognized by encodings.aliases |
|
| 3260 |
+ ''' |
|
| 3261 |
+ if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
|
|
| 3262 |
+ # strip Byte Order Mark (if present) |
|
| 3263 |
+ if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): |
|
| 3264 |
+ if _debug: |
|
| 3265 |
+ sys.stderr.write('stripping BOM\n')
|
|
| 3266 |
+ if encoding != 'utf-16be': |
|
| 3267 |
+ sys.stderr.write('trying utf-16be instead\n')
|
|
| 3268 |
+ encoding = 'utf-16be' |
|
| 3269 |
+ data = data[2:] |
|
| 3270 |
+ elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): |
|
| 3271 |
+ if _debug: |
|
| 3272 |
+ sys.stderr.write('stripping BOM\n')
|
|
| 3273 |
+ if encoding != 'utf-16le': |
|
| 3274 |
+ sys.stderr.write('trying utf-16le instead\n')
|
|
| 3275 |
+ encoding = 'utf-16le' |
|
| 3276 |
+ data = data[2:] |
|
| 3277 |
+ elif data[:3] == '\xef\xbb\xbf': |
|
| 3278 |
+ if _debug: |
|
| 3279 |
+ sys.stderr.write('stripping BOM\n')
|
|
| 3280 |
+ if encoding != 'utf-8': |
|
| 3281 |
+ sys.stderr.write('trying utf-8 instead\n')
|
|
| 3282 |
+ encoding = 'utf-8' |
|
| 3283 |
+ data = data[3:] |
|
| 3284 |
+ elif data[:4] == '\x00\x00\xfe\xff': |
|
| 3285 |
+ if _debug: |
|
| 3286 |
+ sys.stderr.write('stripping BOM\n')
|
|
| 3287 |
+ if encoding != 'utf-32be': |
|
| 3288 |
+ sys.stderr.write('trying utf-32be instead\n')
|
|
| 3289 |
+ encoding = 'utf-32be' |
|
| 3290 |
+ data = data[4:] |
|
| 3291 |
+ elif data[:4] == '\xff\xfe\x00\x00': |
|
| 3292 |
+ if _debug: |
|
| 3293 |
+ sys.stderr.write('stripping BOM\n')
|
|
| 3294 |
+ if encoding != 'utf-32le': |
|
| 3295 |
+ sys.stderr.write('trying utf-32le instead\n')
|
|
| 3296 |
+ encoding = 'utf-32le' |
|
| 3297 |
+ data = data[4:] |
|
| 3298 |
+ newdata = unicode(data, encoding) |
|
| 3299 |
+ if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
|
|
| 3300 |
+ declmatch = re.compile('^<\?xml[^>]*?>')
|
|
| 3301 |
+ newdecl = '''<?xml version='1.0' encoding='utf-8'?>''' |
|
| 3302 |
+ if declmatch.search(newdata): |
|
| 3303 |
+ newdata = declmatch.sub(newdecl, newdata) |
|
| 3304 |
+ else: |
|
| 3305 |
+ newdata = newdecl + u'\n' + newdata |
|
| 3306 |
+ return newdata.encode('utf-8')
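A hedged example of the conversion above: a latin-1 byte string is re-encoded and its XML declaration rewritten so downstream parsers see utf-8:

    latin1 = "<?xml version='1.0' encoding='iso-8859-1'?><title>caf\xe9</title>"
    print _toUTF8(latin1, 'iso-8859-1')
    # -> "<?xml version='1.0' encoding='utf-8'?><title>caf\xc3\xa9</title>"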
|
|
| 3307 |
+ |
|
| 3308 |
+def _stripDoctype(data): |
|
| 3309 |
+ '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) |
|
| 3310 |
+ |
|
| 3311 |
+ rss_version may be 'rss091n' or None |
|
| 3312 |
+ stripped_data is the same XML document, minus the DOCTYPE |
|
| 3313 |
+ ''' |
|
| 3314 |
+ start = re.search('<\w',data)
|
|
| 3315 |
+ start = start and start.start() or -1 |
|
| 3316 |
+ head,data = data[:start+1], data[start+1:] |
|
| 3317 |
+ |
|
| 3318 |
+ entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) |
|
| 3319 |
+ entity_results=entity_pattern.findall(head) |
|
| 3320 |
+ head = entity_pattern.sub('', head)
|
|
| 3321 |
+ doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) |
|
| 3322 |
+ doctype_results = doctype_pattern.findall(head) |
|
| 3323 |
+ doctype = doctype_results and doctype_results[0] or '' |
|
| 3324 |
+ if doctype.lower().count('netscape'):
|
|
| 3325 |
+ version = 'rss091n' |
|
| 3326 |
+ else: |
|
| 3327 |
+ version = None |
|
| 3328 |
+ |
|
| 3329 |
+ # only allow in 'safe' inline entity definitions |
|
| 3330 |
+ replacement='' |
|
| 3331 |
+ if len(doctype_results)==1 and entity_results: |
|
| 3332 |
+ safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
|
|
| 3333 |
+ safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) |
|
| 3334 |
+ if safe_entities: |
|
| 3335 |
+ replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities) |
|
| 3336 |
+ data = doctype_pattern.sub(replacement, head) + data |
|
| 3337 |
+ |
|
| 3338 |
+ return version, data, dict(replacement and safe_pattern.findall(replacement)) |
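An illustrative sketch of _stripDoctype() on a Netscape RSS 0.91 document (the sample markup is made up): the DOCTYPE is detected and removed, and no inline entity definitions survive:

    doc = ('<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN">'
           '<rss version="0.91"/>')
    version, stripped, entities = _stripDoctype(doc)
    print version    # -> 'rss091n'
    print stripped   # -> '<rss version="0.91"/>'
    print entities   # -> {}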
|
| 3339 |
+ |
|
| 3340 |
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): |
|
| 3341 |
+ '''Parse a feed from a URL, file, stream, or string''' |
|
| 3342 |
+ result = FeedParserDict() |
|
| 3343 |
+ result['feed'] = FeedParserDict() |
|
| 3344 |
+ result['entries'] = [] |
|
| 3345 |
+ if _XML_AVAILABLE: |
|
| 3346 |
+ result['bozo'] = 0 |
|
| 3347 |
+ if type(handlers) == types.InstanceType: |
|
| 3348 |
+ handlers = [handlers] |
|
| 3349 |
+ try: |
|
| 3350 |
+ f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) |
|
| 3351 |
+ data = f.read() |
|
| 3352 |
+ except Exception as e: |
|
| 3353 |
+ result['bozo'] = 1 |
|
| 3354 |
+ result['bozo_exception'] = e |
|
| 3355 |
+ data = '' |
|
| 3356 |
+ f = None |
|
| 3357 |
+ |
|
| 3358 |
+ # if feed is gzip-compressed, decompress it |
|
| 3359 |
+ if f and data and hasattr(f, 'headers'): |
|
| 3360 |
+ if gzip and f.headers.get('content-encoding', '') == 'gzip':
|
|
| 3361 |
+ try: |
|
| 3362 |
+ data = gzip.GzipFile(fileobj=_StringIO(data)).read() |
|
| 3363 |
+ except Exception as e: |
|
| 3364 |
+ # Some feeds claim to be gzipped but they're not, so |
|
| 3365 |
+ # we get garbage. Ideally, we should re-request the |
|
| 3366 |
+ # feed without the 'Accept-encoding: gzip' header, |
|
| 3367 |
+ # but we don't. |
|
| 3368 |
+ result['bozo'] = 1 |
|
| 3369 |
+ result['bozo_exception'] = e |
|
| 3370 |
+ data = '' |
|
| 3371 |
+ elif zlib and f.headers.get('content-encoding', '') == 'deflate':
|
|
| 3372 |
+ try: |
|
| 3373 |
+ data = zlib.decompress(data, -zlib.MAX_WBITS) |
|
| 3374 |
+ except Exception as e: |
|
| 3375 |
+ result['bozo'] = 1 |
|
| 3376 |
+ result['bozo_exception'] = e |
|
| 3377 |
+ data = '' |
|
| 3378 |
+ |
|
| 3379 |
+ # save HTTP headers |
|
| 3380 |
+ if hasattr(f, 'info'): |
|
| 3381 |
+ info = f.info() |
|
| 3382 |
+ etag = info.getheader('ETag')
|
|
| 3383 |
+ if etag: |
|
| 3384 |
+ result['etag'] = etag |
|
| 3385 |
+ last_modified = info.getheader('Last-Modified')
|
|
| 3386 |
+ if last_modified: |
|
| 3387 |
+ result['modified'] = _parse_date(last_modified) |
|
| 3388 |
+ if hasattr(f, 'url'): |
|
| 3389 |
+ result['href'] = f.url |
|
| 3390 |
+ result['status'] = 200 |
|
| 3391 |
+ if hasattr(f, 'status'): |
|
| 3392 |
+ result['status'] = f.status |
|
| 3393 |
+ if hasattr(f, 'headers'): |
|
| 3394 |
+ result['headers'] = f.headers.dict |
|
| 3395 |
+ if hasattr(f, 'close'): |
|
| 3396 |
+ f.close() |
|
| 3397 |
+ |
|
| 3398 |
+ # there are four encodings to keep track of: |
|
| 3399 |
+ # - http_encoding is the encoding declared in the Content-Type HTTP header |
|
| 3400 |
+ # - xml_encoding is the encoding declared in the <?xml declaration |
|
| 3401 |
+ # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data |
|
| 3402 |
+ # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications |
|
| 3403 |
+ http_headers = result.get('headers', {})
|
|
| 3404 |
+ result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ |
|
| 3405 |
+ _getCharacterEncoding(http_headers, data) |
|
| 3406 |
+ if http_headers and (not acceptable_content_type): |
|
| 3407 |
+ if http_headers.has_key('content-type'):
|
|
| 3408 |
+ bozo_message = '%s is not an XML media type' % http_headers['content-type'] |
|
| 3409 |
+ else: |
|
| 3410 |
+ bozo_message = 'no Content-type specified' |
|
| 3411 |
+ result['bozo'] = 1 |
|
| 3412 |
+ result['bozo_exception'] = NonXMLContentType(bozo_message) |
|
| 3413 |
+ |
|
| 3414 |
+ result['version'], data, entities = _stripDoctype(data) |
|
| 3415 |
+ |
|
| 3416 |
+ baseuri = http_headers.get('content-location', result.get('href'))
|
|
| 3417 |
+ baselang = http_headers.get('content-language', None)
|
|
| 3418 |
+ |
|
| 3419 |
+ # if server sent 304, we're done |
|
| 3420 |
+ if result.get('status', 0) == 304:
|
|
| 3421 |
+ result['version'] = '' |
|
| 3422 |
+ result['debug_message'] = 'The feed has not changed since you last checked, ' + \ |
|
| 3423 |
+ 'so the server sent no data. This is a feature, not a bug!' |
|
| 3424 |
+ return result |
|
| 3425 |
+ |
|
| 3426 |
+ # if there was a problem downloading, we're done |
|
| 3427 |
+ if not data: |
|
| 3428 |
+ return result |
|
| 3429 |
+ |
|
| 3430 |
+ # determine character encoding |
|
| 3431 |
+ use_strict_parser = 0 |
|
| 3432 |
+ known_encoding = 0 |
|
| 3433 |
+ tried_encodings = [] |
|
| 3434 |
+ # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM |
|
| 3435 |
+ for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): |
|
| 3436 |
+ if not proposed_encoding: continue |
|
| 3437 |
+ if proposed_encoding in tried_encodings: continue |
|
| 3438 |
+ tried_encodings.append(proposed_encoding) |
|
| 3439 |
+ try: |
|
| 3440 |
+ data = _toUTF8(data, proposed_encoding) |
|
| 3441 |
+ known_encoding = use_strict_parser = 1 |
|
| 3442 |
+ break |
|
| 3443 |
+ except: |
|
| 3444 |
+ pass |
|
| 3445 |
+ # if no luck and we have auto-detection library, try that |
|
| 3446 |
+ if (not known_encoding) and chardet: |
|
| 3447 |
+ try: |
|
| 3448 |
+ proposed_encoding = chardet.detect(data)['encoding'] |
|
| 3449 |
+ if proposed_encoding and (proposed_encoding not in tried_encodings): |
|
| 3450 |
+ tried_encodings.append(proposed_encoding) |
|
| 3451 |
+ data = _toUTF8(data, proposed_encoding) |
|
| 3452 |
+ known_encoding = use_strict_parser = 1 |
|
| 3453 |
+ except: |
|
| 3454 |
+ pass |
|
| 3455 |
+ # if still no luck and we haven't tried utf-8 yet, try that |
|
| 3456 |
+ if (not known_encoding) and ('utf-8' not in tried_encodings):
|
|
| 3457 |
+ try: |
|
| 3458 |
+ proposed_encoding = 'utf-8' |
|
| 3459 |
+ tried_encodings.append(proposed_encoding) |
|
| 3460 |
+ data = _toUTF8(data, proposed_encoding) |
|
| 3461 |
+ known_encoding = use_strict_parser = 1 |
|
| 3462 |
+ except: |
|
| 3463 |
+ pass |
|
| 3464 |
+ # if still no luck and we haven't tried windows-1252 yet, try that |
|
| 3465 |
+ if (not known_encoding) and ('windows-1252' not in tried_encodings):
|
|
| 3466 |
+ try: |
|
| 3467 |
+ proposed_encoding = 'windows-1252' |
|
| 3468 |
+ tried_encodings.append(proposed_encoding) |
|
| 3469 |
+ data = _toUTF8(data, proposed_encoding) |
|
| 3470 |
+ known_encoding = use_strict_parser = 1 |
|
| 3471 |
+ except: |
|
| 3472 |
+ pass |
|
| 3473 |
+ # if still no luck and we haven't tried iso-8859-2 yet, try that. |
|
| 3474 |
+ if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
|
|
| 3475 |
+ try: |
|
| 3476 |
+ proposed_encoding = 'iso-8859-2' |
|
| 3477 |
+ tried_encodings.append(proposed_encoding) |
|
| 3478 |
+ data = _toUTF8(data, proposed_encoding) |
|
| 3479 |
+ known_encoding = use_strict_parser = 1 |
|
| 3480 |
+ except: |
|
| 3481 |
+ pass |
|
| 3482 |
+ # if still no luck, give up |
|
| 3483 |
+ if not known_encoding: |
|
| 3484 |
+ result['bozo'] = 1 |
|
| 3485 |
+ result['bozo_exception'] = CharacterEncodingUnknown( \ |
|
| 3486 |
+ 'document encoding unknown, I tried ' + \ |
|
| 3487 |
+ '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \ |
|
| 3488 |
+ (result['encoding'], xml_encoding)) |
|
| 3489 |
+ result['encoding'] = '' |
|
| 3490 |
+ elif proposed_encoding != result['encoding']: |
|
| 3491 |
+ result['bozo'] = 1 |
|
| 3492 |
+ result['bozo_exception'] = CharacterEncodingOverride( \ |
|
| 3493 |
+ 'document declared as %s, but parsed as %s' % \ 
|
| 3494 |
+ (result['encoding'], proposed_encoding)) |
|
| 3495 |
+ result['encoding'] = proposed_encoding |
|
| 3496 |
+ |
|
| 3497 |
+ if not _XML_AVAILABLE: |
|
| 3498 |
+ use_strict_parser = 0 |
|
| 3499 |
+ if use_strict_parser: |
|
| 3500 |
+ # initialize the SAX parser |
|
| 3501 |
+ feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') |
|
| 3502 |
+ saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) |
|
| 3503 |
+ saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) |
|
| 3504 |
+ saxparser.setContentHandler(feedparser) |
|
| 3505 |
+ saxparser.setErrorHandler(feedparser) |
|
| 3506 |
+ source = xml.sax.xmlreader.InputSource() |
|
| 3507 |
+ source.setByteStream(_StringIO(data)) |
|
| 3508 |
+ if hasattr(saxparser, '_ns_stack'): |
|
| 3509 |
+ # work around bug in built-in SAX parser (doesn't recognize xml: namespace) |
|
| 3510 |
+ # PyXML doesn't have this problem, and it doesn't have _ns_stack either |
|
| 3511 |
+ saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
|
|
| 3512 |
+ try: |
|
| 3513 |
+ saxparser.parse(source) |
|
| 3514 |
+ except Exception, e: |
|
| 3515 |
+ if _debug: |
|
| 3516 |
+ import traceback |
|
| 3517 |
+ traceback.print_stack() |
|
| 3518 |
+ traceback.print_exc() |
|
| 3519 |
+ sys.stderr.write('xml parsing failed\n')
|
|
| 3520 |
+ result['bozo'] = 1 |
|
| 3521 |
+ result['bozo_exception'] = feedparser.exc or e |
|
| 3522 |
+ use_strict_parser = 0 |
|
| 3523 |
+ if not use_strict_parser: |
|
| 3524 |
+ feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities) |
|
| 3525 |
+ feedparser.feed(data) |
|
| 3526 |
+ result['feed'] = feedparser.feeddata |
|
| 3527 |
+ result['entries'] = feedparser.entries |
|
| 3528 |
+ result['version'] = result['version'] or feedparser.version |
|
| 3529 |
+ result['namespaces'] = feedparser.namespacesInUse |
|
| 3530 |
+ return result |
|
| 3531 |
+ |
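The loop above settles the character encoding by trying, in order: the HTTP-declared encoding, the encoding from the <?xml ...?> declaration, the BOM-sniffed encoding, chardet's guess (if available), then utf-8, windows-1252 and iso-8859-2, flagging the result as bozo if the winner disagrees with what was declared. A minimal sketch of that fallback pattern; the function name is illustrative, not part of the module:

    def decode_with_fallbacks(data, declared_encodings):
        """Return (text, encoding) for the first candidate that decodes cleanly.

        Illustrative only: the real loop above converts to UTF-8 with _toUTF8()
        and records bozo / CharacterEncodingOverride details along the way.
        """
        tried = []
        for enc in list(declared_encodings) + ['utf-8', 'windows-1252', 'iso-8859-2']:
            if not enc or enc in tried:
                continue
            tried.append(enc)
            try:
                return data.decode(enc), enc
            except (UnicodeDecodeError, LookupError):
                continue
        raise UnicodeError('none of %r could decode the document' % (tried,))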
|
| 3532 |
+class Serializer: |
|
| 3533 |
+ def __init__(self, results): |
|
| 3534 |
+ self.results = results |
|
| 3535 |
+ |
|
| 3536 |
+class TextSerializer(Serializer): |
|
| 3537 |
+ def write(self, stream=sys.stdout): |
|
| 3538 |
+ self._writer(stream, self.results, '') |
|
| 3539 |
+ |
|
| 3540 |
+ def _writer(self, stream, node, prefix): |
|
| 3541 |
+ if not node: return |
|
| 3542 |
+ if hasattr(node, 'keys'): |
|
| 3543 |
+ keys = node.keys() |
|
| 3544 |
+ keys.sort() |
|
| 3545 |
+ for k in keys: |
|
| 3546 |
+ if k in ('description', 'link'): continue
|
|
| 3547 |
+ if node.has_key(k + '_detail'): continue |
|
| 3548 |
+ if node.has_key(k + '_parsed'): continue |
|
| 3549 |
+ self._writer(stream, node[k], prefix + k + '.') |
|
| 3550 |
+ elif type(node) == types.ListType: |
|
| 3551 |
+ index = 0 |
|
| 3552 |
+ for n in node: |
|
| 3553 |
+ self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].') |
|
| 3554 |
+ index += 1 |
|
| 3555 |
+ else: |
|
| 3556 |
+ try: |
|
| 3557 |
+ s = str(node).encode('utf-8')
|
|
| 3558 |
+ s = s.replace('\\', '\\\\')
|
|
| 3559 |
+ s = s.replace('\r', '')
|
|
| 3560 |
+ s = s.replace('\n', r'\n')
|
|
| 3561 |
+ stream.write(prefix[:-1]) |
|
| 3562 |
+ stream.write('=')
|
|
| 3563 |
+ stream.write(s) |
|
| 3564 |
+ stream.write('\n')
|
|
| 3565 |
+ except: |
|
| 3566 |
+ pass |
|
| 3567 |
+ |
|
| 3568 |
+class PprintSerializer(Serializer): |
|
| 3569 |
+ def write(self, stream=sys.stdout): |
|
| 3570 |
+ if self.results.has_key('href'):
|
|
| 3571 |
+ stream.write(self.results['href'] + '\n\n') |
|
| 3572 |
+ from pprint import pprint |
|
| 3573 |
+ pprint(self.results, stream) |
|
| 3574 |
+ stream.write('\n')
|
|
| 3575 |
+ |
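The two serializer classes above back the command-line mode that follows: TextSerializer flattens the result into dotted key=value lines (skipping description/link and the *_detail / *_parsed duplicates), while PprintSerializer pretty-prints the whole dict. A small usage sketch; the feed URL is only a placeholder:

    result = parse('http://feeds.feedburner.com/TechCrunch')  # placeholder URL
    TextSerializer(result).write(sys.stdout)    # e.g. feed.title=..., entries[0].title=...
    PprintSerializer(result).write(sys.stdout)  # href line, blank line, then pprint of the dict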
|
| 3576 |
+if __name__ == '__main__': |
|
| 3577 |
+ try: |
|
| 3578 |
+ from optparse import OptionParser |
|
| 3579 |
+ except: |
|
| 3580 |
+ OptionParser = None |
|
| 3581 |
+ |
|
| 3582 |
+ if OptionParser: |
|
| 3583 |
+ optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-") |
|
| 3584 |
+ optionParser.set_defaults(format="pprint") |
|
| 3585 |
+ optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
|
|
| 3586 |
+ optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
|
|
| 3587 |
+ optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
|
|
| 3588 |
+ optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
|
|
| 3589 |
+ optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
|
|
| 3590 |
+ optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
|
|
| 3591 |
+ (options, urls) = optionParser.parse_args() |
|
| 3592 |
+ if options.verbose: |
|
| 3593 |
+ _debug = 1 |
|
| 3594 |
+ if not urls: |
|
| 3595 |
+ optionParser.print_help() |
|
| 3596 |
+ sys.exit(0) |
|
| 3597 |
+ else: |
|
| 3598 |
+ if not sys.argv[1:]: |
|
| 3599 |
+ print __doc__ |
|
| 3600 |
+ sys.exit(0) |
|
| 3601 |
+ class _Options: |
|
| 3602 |
+ etag = modified = agent = referrer = None |
|
| 3603 |
+ format = 'pprint' |
|
| 3604 |
+ options = _Options() |
|
| 3605 |
+ urls = sys.argv[1:] |
|
| 3606 |
+ |
|
| 3607 |
+ zopeCompatibilityHack() |
|
| 3608 |
+ |
|
| 3609 |
+ serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer) |
|
| 3610 |
+ for url in urls: |
|
| 3611 |
+ results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer) |
|
| 3612 |
+ serializer(results).write(sys.stdout) |
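Because parse() accepts etag and modified, and returns status 304 with an empty version when the server reports no change (handled earlier in parse()), a caller that polls a feed can skip re-downloading unchanged content. A hedged sketch of that pattern; the helper name is illustrative:

    def poll_feed(url, etag=None, modified=None):
        # Hand back the validators from the previous fetch so the server can answer 304.
        result = parse(url, etag=etag, modified=modified)
        if result.get('status') == 304:
            return None, etag, modified                    # nothing new since last time
        # feedparser exposes fresh validators when the server supplies them.
        return result, result.get('etag', etag), result.get('modified', modified)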
| ... | ... |
@@ -0,0 +1,630 @@ |
| 1 |
+#!/usr/bin/python2.5 |
|
| 2 |
+# chmod 755 me, and make sure I have UNIX style newlines. |
|
| 3 |
+# |
|
| 4 |
+# techcrunch.py |
|
| 5 |
+# |
|
| 6 |
+# http://feeds.feedburner.com/TechCrunch |
|
| 7 |
+# feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) |
|
| 8 |
+# feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments'] |
|
| 9 |
+# |
|
| 10 |
+# TODO: |
|
| 11 |
+# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>' |
|
| 12 |
+# link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/" |
|
| 13 |
+ |
|
| 14 |
+import feedparser |
|
| 15 |
+import yaml |
|
| 16 |
+import sys |
|
| 17 |
+import os |
|
| 18 |
+import time |
|
| 19 |
+import StringIO |
|
| 20 |
+import codecs |
|
| 21 |
+import traceback |
|
| 22 |
+import calendar |
|
| 23 |
+import pickle |
|
| 24 |
+import exceptions |
|
| 25 |
+import urllib |
|
| 26 |
+import urllib2 |
|
| 27 |
+import httplib |
|
| 28 |
+import shutil |
|
| 29 |
+import glob |
|
| 30 |
+import smtplib |
|
| 31 |
+import bisect |
|
| 32 |
+import analysis |
|
| 33 |
+import simplejson as json |
|
| 34 |
+import cookielib |
|
| 35 |
+ |
|
| 36 |
+debug = True |
|
| 37 |
+any_entry_added = False |
|
| 38 |
+ |
|
| 39 |
+localdir = '' |
|
| 40 |
+ |
|
| 41 |
+html_head = """ |
|
| 42 |
+<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'> |
|
| 43 |
+<HTML><HEAD> |
|
| 44 |
+ <title>TechCrunch Feed Filter</title> |
|
| 45 |
+ <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> --> |
|
| 46 |
+ <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" /> |
|
| 47 |
+ <style type="text/css"> |
|
| 48 |
+ body { font-family: "Arial", sans-serif; } 
|
|
| 49 |
+ .author { font-size: smaller; }
|
|
| 50 |
+ .h3 { font-size: larger; }
|
|
| 51 |
+ a { text-decoration: none; }
|
|
| 52 |
+ /* table { border: none; border-collapse:collapse; font-size: large } */
|
|
| 53 |
+ table { border-collapse: collapse; }
|
|
| 54 |
+ table.legend { border:1px solid LightSlateGray; font-size: medium; border-collapse:separate; } 
|
|
| 55 |
+ table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; }
|
|
| 56 |
+ table.legend td { border: 1px solid LightSlateGray; }
|
|
| 57 |
+ tr.even { background:#%s; padding: 2em; }
|
|
| 58 |
+ tr.odd { background:#%s; padding-bottom: 2em; }
|
|
| 59 |
+ </style> |
|
| 60 |
+</HEAD> |
|
| 61 |
+<BODY> |
|
| 62 |
+<div align='center'><h3>TechCrunch Feed Filter</h3></div> |
|
| 63 |
+This page shows the analysis used to filter the noise out of the TechCrunch feed and produce <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br /> 
|
| 64 |
+""" |
|
| 65 |
+ |
|
| 66 |
+html_footer = """ |
|
| 67 |
+</table> |
|
| 68 |
+</div><br /> |
|
| 69 |
+<div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>, |
|
| 70 |
+<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> • <a href="stats.txt">status</a></div><br /> |
|
| 71 |
+</BODY> |
|
| 72 |
+</HTML> |
|
| 73 |
+""" |
|
| 74 |
+ |
|
| 75 |
+img_width = 300 |
|
| 76 |
+img_height = 50 |
|
| 77 |
+ |
|
| 78 |
+series_1_color = "0000FF" |
|
| 79 |
+series_2_color = "00AA00" |
|
| 80 |
+threshold_color = "FF8C00" |
|
| 81 |
+ |
|
| 82 |
+even_background = "F8F8F8" |
|
| 83 |
+#even_background = "FFFFFF" |
|
| 84 |
+odd_background = "E8E8E8" |
|
| 85 |
+ |
|
| 86 |
+def asciiize( s ): |
|
| 87 |
+ try: |
|
| 88 |
+ return s.encode( 'ascii' ) |
|
| 89 |
+ except UnicodeEncodeError, e: |
|
| 90 |
+ return s |
|
| 91 |
+ except exceptions.AttributeError, e: |
|
| 92 |
+ return s |
|
| 93 |
+ |
|
| 94 |
+def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ): |
|
| 95 |
+ """Sends Email""" |
|
| 96 |
+ smtp = smtplib.SMTP( 'localhost' ) |
|
| 97 |
+ smtp.sendmail( fromaddr, \ |
|
| 98 |
+ toaddrs, \ |
|
| 99 |
+ "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n%s" % \ |
|
| 100 |
+ ( fromaddr, ", ".join( toaddrs ), subject, message ) ) |
|
| 101 |
+ smtp.quit() |
|
| 102 |
+ |
|
| 103 |
+def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color ): |
|
| 104 |
+# comment_times, comment_values = zip( *comments ) |
|
| 105 |
+# retweet_times, retweet_values = zip( *retweets ) |
|
| 106 |
+ |
|
| 107 |
+ # TODO handle failure cases, -1 |
|
| 108 |
+ |
|
| 109 |
+ if not len( comment_times ): |
|
| 110 |
+ comment_times = [ time_posted, ] |
|
| 111 |
+ if not len( comment_values ): |
|
| 112 |
+ comment_values = [ 0, ] |
|
| 113 |
+ if not len( retweet_times ): |
|
| 114 |
+ retweet_times = [ time_posted, ] |
|
| 115 |
+ if not len( retweet_values ): |
|
| 116 |
+ retweet_values = [ 0, ] |
|
| 117 |
+ |
|
| 118 |
+# comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ] |
|
| 119 |
+# retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ] |
|
| 120 |
+ comment_times = [ (i - time_posted) / 1800 for i in comment_times ] |
|
| 121 |
+ retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ] |
|
| 122 |
+ |
|
| 123 |
+ min_comment_time = min( comment_times ) |
|
| 124 |
+ max_comment_time = max( comment_times ) |
|
| 125 |
+ min_comment_value = min( comment_values ) |
|
| 126 |
+ max_comment_value = max( comment_values ) |
|
| 127 |
+ min_retweet_time = min( retweet_times ) |
|
| 128 |
+ max_retweet_time = max( retweet_times ) |
|
| 129 |
+ min_retweet_value = min( retweet_values ) |
|
| 130 |
+ max_retweet_value = max( retweet_values ) |
|
| 131 |
+ |
|
| 132 |
+ if len( comment_values ) < 8 and len( comment_values ) > 1: |
|
| 133 |
+ # max_comment_value *= 2 |
|
| 134 |
+ pass |
|
| 135 |
+ elif len( comment_values ) == 1: |
|
| 136 |
+ min_comment_value = 0 |
|
| 137 |
+ if len( retweet_values ) < 8 and len( retweet_values ) > 1: |
|
| 138 |
+ # max_retweet_value *= 2 |
|
| 139 |
+ pass |
|
| 140 |
+ elif len( retweet_values ) == 1: |
|
| 141 |
+ min_retweet_value = 0 |
|
| 142 |
+ |
|
| 143 |
+ min_comment_value = 0 |
|
| 144 |
+ min_retweet_value = 0 |
|
| 145 |
+ |
|
| 146 |
+ chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \ |
|
| 147 |
+ ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color ) |
|
| 148 |
+ chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ), |
|
| 149 |
+ ','.join( [ str( n ) for n in comment_values ] ), |
|
| 150 |
+ ','.join( [ str( n ) for n in retweet_times ] ), |
|
| 151 |
+ ','.join( [ str( n ) for n in retweet_values ] ) ) |
|
| 152 |
+ if met_threshold_pt != -1: |
|
| 153 |
+ chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt ) |
|
| 154 |
+ chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \ |
|
| 155 |
+ ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value, |
|
| 156 |
+ 0, max( 7, max_comment_time ), |
|
| 157 |
+ min_comment_value, max_comment_value, |
|
| 158 |
+ 0, max( 7, max_retweet_time ), |
|
| 159 |
+ min_retweet_value, max_retweet_value ) 
|
| 160 |
+ chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, ) |
|
| 161 |
+ return chart_url |
|
| 162 |
+ |
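make_chart_url() packs the sampled history into a single Google Chart API line-chart URL: sample times are bucketed into half-hour slots since posting, comments and retweets become the two series, and a non-negative met_threshold_pt adds a circle marker at the sample where the article qualified. A usage sketch with invented sample data:

    posted = 1282197199                     # publish time in epoch seconds (example value)
    comment_times = [posted + 1800, posted + 3600, posted + 5400]
    comments      = [0, 5, 12]
    retweet_times = [posted + 1800, posted + 3600, posted + 5400]
    retweets      = [3, 20, 55]
    url = make_chart_url(posted, comment_times, comments,
                         retweet_times, retweets,
                         2,                  # circle the third sample (index 2)
                         even_background)
    # url is an "http://chart.apis.google.com/chart?cht=lxy&..." string,
    # ready for an <img src="..."> tag or for Save_image() further below.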
|
| 163 |
+def process_feed( yaml_items ): |
|
| 164 |
+ """ |
|
| 165 |
+ Retrieve the TechCrunch feed and process its entries. 
|
| 166 |
+ yaml_items (in, out) The list of tracked article records; updated in place. 
|
| 167 |
+ """ |
|
| 168 |
+ |
|
| 169 |
+ feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) |
|
| 170 |
+ if hasattr( feed, 'status' ): |
|
| 171 |
+ if feed.status == 304: |
|
| 172 |
+ pass |
|
| 173 |
+ else: |
|
| 174 |
+ feed_is_modified = True |
|
| 175 |
+ if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302: |
|
| 176 |
+ if feed.status == 503: |
|
| 177 |
+ print "the feed is temporarily unavailable." |
|
| 178 |
+ elif feed.status == 400: |
|
| 179 |
+ print "the feed says we made a bad request." |
|
| 180 |
+ elif feed.status == 502: |
|
| 181 |
+ print "the feed reported a bad gateway error." |
|
| 182 |
+ elif feed.status == 404: |
|
| 183 |
+ print "the feed says the page was not found." |
|
| 184 |
+ elif feed.status == 500: |
|
| 185 |
+ print "the feed had an internal server error." |
|
| 186 |
+ elif feed.status == 403: |
|
| 187 |
+ print "Access to the feed was forbidden." |
|
| 188 |
+ else: |
|
| 189 |
+ print "the feed returned feed.status %d." % ( feed.status, ) |
|
| 190 |
+ else: |
|
| 191 |
+ # Save off this feed to a pickle file. 
|
| 192 |
+ f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' ) |
|
| 193 |
+ try: |
|
| 194 |
+ pickle.dump( feed, f ) |
|
| 195 |
+ except( pickle.PicklingError, exceptions.TypeError ), e: |
|
| 196 |
+ print "An error occurred while pickling the feed: %s." % \ |
|
| 197 |
+ ( # str(e.__class__), |
|
| 198 |
+ str(e) ) |
|
| 199 |
+ traceback.print_exc( file = sys.stdout ) |
|
| 200 |
+ feed_is_modified = False |
|
| 201 |
+ f.close() |
|
| 202 |
+ |
|
| 203 |
+ for i in reversed( feed.entries ): |
|
| 204 |
+ process_item( i, yaml_items ) |
|
| 205 |
+ |
|
| 206 |
+ # If we have more than 200 items, remove the old ones. |
|
| 207 |
+ while len( yaml_items ) > 200: |
|
| 208 |
+ yaml_items.pop() |
|
| 209 |
+ |
|
| 210 |
+ cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) ) |
|
| 211 |
+ |
|
| 212 |
+ for i in yaml_items: |
|
| 213 |
+ # i['title'] = asciiize( i['title'] ) |
|
| 214 |
+ # i['tags'] = map( asciiize, i['tags'] ) |
|
| 215 |
+ process_yaml_item( i, cookie ) |
|
| 216 |
+ |
|
| 217 |
+ else: |
|
| 218 |
+ if hasattr(feed, 'bozo_exception'): |
|
| 219 |
+ e = feed.bozo_exception |
|
| 220 |
+ if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110: |
|
| 221 |
+ print_last_line = True |
|
| 222 |
+ if hasattr(e, 'reason'): |
|
| 223 |
+ if e.reason[0] == 110: |
|
| 224 |
+ print "the feed's connection timed out." |
|
| 225 |
+ print_last_line = False |
|
| 226 |
+ elif e.reason[0] == 111: |
|
| 227 |
+ print "the feed's connection was refused." |
|
| 228 |
+ print_last_line = False |
|
| 229 |
+ elif e.reason[0] == 104: |
|
| 230 |
+ print "the feed reset the connection." |
|
| 231 |
+ print_last_line = False |
|
| 232 |
+ else: |
|
| 233 |
+ print "the feed had a URLError with reason %s." % ( str(e.reason), ) |
|
| 234 |
+ print_last_line = False |
|
| 235 |
+ if print_last_line: |
|
| 236 |
+ print "the feed had a URLError %s" % ( str(e), ) |
|
| 237 |
+ elif isinstance( e, httplib.BadStatusLine ): |
|
| 238 |
+ if hasattr(e, 'message'): |
|
| 239 |
+ print "the feed gave a bad status line %s." % ( str(e.message ), ) |
|
| 240 |
+ else: |
|
| 241 |
+ print "the feed gave a bad status line." |
|
| 242 |
+ else: |
|
| 243 |
+ if len( str(e) ): |
|
| 244 |
+ print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) ) |
|
| 245 |
+ else: |
|
| 246 |
+ print "the feed bozo_exception: %s %s" % ( str(e.__class__), repr(e) ) |
|
| 247 |
+ else: |
|
| 248 |
+ print "the feed returned class %s, %s" % ( str(feed.__class__), str(feed) ) |
|
| 249 |
+ |
|
| 250 |
+def process_item( feed_item, yaml_items ): |
|
| 251 |
+ # Get the time |
|
| 252 |
+ global any_entry_added |
|
| 253 |
+ timecode_now = int( time.time() ) |
|
| 254 |
+ date_parsed = time.gmtime() |
|
| 255 |
+ if hasattr( feed_item, 'issued_parsed' ): |
|
| 256 |
+ date_parsed = feed_item.issued_parsed |
|
| 257 |
+ date_set = True |
|
| 258 |
+ elif hasattr( feed_item, 'date_parsed' ): |
|
| 259 |
+ date_parsed = feed_item.date_parsed |
|
| 260 |
+ date_set = True |
|
| 261 |
+ else: |
|
| 262 |
+ print "process_item found no timestamp for", asciiize( feed_item.link ) |
|
| 263 |
+ timecode_parsed = calendar.timegm( date_parsed ) |
|
| 264 |
+ |
|
| 265 |
+ # Look for i.feedburner_origlink in yaml_items |
|
| 266 |
+ yaml_item = None |
|
| 267 |
+ for i in yaml_items: |
|
| 268 |
+ if feed_item.feedburner_origlink == i['link']: |
|
| 269 |
+ yaml_item = i |
|
| 270 |
+ break |
|
| 271 |
+ if not yaml_item: |
|
| 272 |
+ author = '' |
|
| 273 |
+ link = feed_item.link |
|
| 274 |
+ if hasattr( feed_item, 'author' ): |
|
| 275 |
+ author = asciiize( feed_item.author ) |
|
| 276 |
+ if hasattr( feed_item, 'feedburner_origlink' ): |
|
| 277 |
+ link = feed_item.feedburner_origlink |
|
| 278 |
+ |
|
| 279 |
+ # Make a new yaml_item |
|
| 280 |
+ yaml_item = { 'title' : asciiize( feed_item.title ),
|
|
| 281 |
+ 'link' : asciiize( link ), |
|
| 282 |
+ 'author' : author, |
|
| 283 |
+ 'tags' : [], |
|
| 284 |
+ 'orig_posted' : timecode_parsed, |
|
| 285 |
+ 'qualified' : -1, |
|
| 286 |
+ 'comment_times' : [], |
|
| 287 |
+ 'comments' : [], |
|
| 288 |
+ 'slash_comment_times' : [], |
|
| 289 |
+ 'slash_comments' : [], |
|
| 290 |
+ 'retweet_times' : [], |
|
| 291 |
+ 'retweets' : [] |
|
| 292 |
+ } |
|
| 293 |
+ if hasattr( feed_item, 'tags' ): |
|
| 294 |
+ for i in feed_item.tags: |
|
| 295 |
+ yaml_item['tags'].append( asciiize( i.term ) ) |
|
| 296 |
+ |
|
| 297 |
+ yaml_items.insert( 0, yaml_item ) |
|
| 298 |
+ any_entry_added = True |
|
| 299 |
+ |
|
| 300 |
+ # Don't collect new samples for items posted more than 4.5 hours ago. 
|
| 301 |
+ if timecode_parsed < timecode_now - 60 * 30 * 9: |
|
| 302 |
+ return |
|
| 303 |
+ |
|
| 304 |
+ # Now, add the new values |
|
| 305 |
+ if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8: |
|
| 306 |
+ any_entry_added = True |
|
| 307 |
+ yaml_item['slash_comment_times'].append( timecode_now ) |
|
| 308 |
+ yaml_item['slash_comments'].append( int( feed_item.slash_comments ) ) |
|
| 309 |
+ |
|
| 310 |
+def process_yaml_item( yaml_item, cookie ): |
|
| 311 |
+ global any_entry_added |
|
| 312 |
+ |
|
| 313 |
+ timecode_now = int( time.time() ) |
|
| 314 |
+ if len( yaml_item['comments'] ) < 8: |
|
| 315 |
+ num_comments = Get_num_disqus_comments( yaml_item['link'], cookie ) |
|
| 316 |
+ if num_comments != -1: |
|
| 317 |
+ any_entry_added = True |
|
| 318 |
+ yaml_item['comment_times'].append( timecode_now ) |
|
| 319 |
+ yaml_item['comments'].append( num_comments ) |
|
| 320 |
+ |
|
| 321 |
+ if len( yaml_item['retweets'] ) < 8: |
|
| 322 |
+ num_retweets = Get_num_retweets( yaml_item['link'] ) |
|
| 323 |
+ if num_retweets != -1: |
|
| 324 |
+ any_entry_added = True |
|
| 325 |
+ yaml_item['retweet_times'].append( timecode_now ) |
|
| 326 |
+ yaml_item['retweets'].append( num_retweets ) |
|
| 327 |
+ |
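process_item() and process_yaml_item() together maintain each article's history as parallel lists, one timestamp list plus one value list per metric, and stop sampling a metric once eight data points exist. An illustrative entry (values invented) as it would appear in techcrunch.yaml:

    yaml_item = {
        'title': 'Title Text',
        'link': 'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/',
        'author': 'MG Siegler',
        'tags': ['Google', 'privacy'],
        'orig_posted': 1282197199,                  # publish time, epoch seconds
        'qualified': -1,                            # index of the sample that crossed the threshold, or -1
        'comment_times': [1282199000, 1282200800],  # when each Disqus count was sampled
        'comments':      [0, 15],
        'slash_comment_times': [1282199000],        # samples of the feed's slash:comments field
        'slash_comments':      [0],
        'retweet_times': [1282199000, 1282200800],  # when each tweetmeme count was sampled
        'retweets':      [0, 43],
    }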
|
| 328 |
+def Get_num_comments( url_string ): |
|
| 329 |
+ try: |
|
| 330 |
+ f = urllib2.urlopen( url_string ) |
|
| 331 |
+ data = f.read() |
|
| 332 |
+ f.close() |
|
| 333 |
+ except urllib2.URLError, e: |
|
| 334 |
+ if hasattr( e, 'reason' ): |
|
| 335 |
+ print "Get_num_comments got an error:", e.reason |
|
| 336 |
+ elif hasattr( e, 'code' ): |
|
| 337 |
+ print "Get_num_comments got an error. Code:", e.code |
|
| 338 |
+ return -1 |
|
| 339 |
+ tag_to_find = '<a href="#comments" rel="nofollow">' |
|
| 340 |
+ offset = data.find( tag_to_find ) |
|
| 341 |
+ if offset != -1: |
|
| 342 |
+ start_pos = offset + len( tag_to_find ) |
|
| 343 |
+ end_pos = start_pos |
|
| 344 |
+ while str.isdigit( data[ end_pos ] ): |
|
| 345 |
+ end_pos += 1 |
|
| 346 |
+ if end_pos > start_pos: |
|
| 347 |
+ return int( data[start_pos:end_pos] ) |
|
| 348 |
+ return -1 |
|
| 349 |
+ |
|
| 350 |
+def Get_cookie( cookie_request ): |
|
| 351 |
+ cookie = cookielib.CookieJar() |
|
| 352 |
+ try: |
|
| 353 |
+ cookie_response = urllib2.urlopen( cookie_request ) |
|
| 354 |
+ cookie.extract_cookies( cookie_response, cookie_request ) |
|
| 355 |
+ return cookie |
|
| 356 |
+ except urllib2.URLError, e: |
|
| 357 |
+ if hasattr( e, 'reason' ): |
|
| 358 |
+ print "Get_cookie got an error:", e.reason |
|
| 359 |
+ elif hasattr( e, 'code' ): |
|
| 360 |
+ print "Get_cookie got an error. Code:", e.code |
|
| 361 |
+ return None |
|
| 362 |
+ |
|
| 363 |
+def Get_num_disqus_comments( url_string, cookie ): |
|
| 364 |
+ |
|
| 365 |
+ if cookie == None: |
|
| 366 |
+ return -1 |
|
| 367 |
+ |
|
| 368 |
+ try: |
|
| 369 |
+ f = urllib2.urlopen( url_string ) |
|
| 370 |
+ data = f.read() |
|
| 371 |
+ f.close() |
|
| 372 |
+ except urllib2.URLError, e: |
|
| 373 |
+ if hasattr( e, 'reason' ): |
|
| 374 |
+ print "Get_num_disqus_comments got an error:", e.reason |
|
| 375 |
+ elif hasattr( e, 'code' ): |
|
| 376 |
+ print "Get_num_disqus_comments got an error. Code:", e.code |
|
| 377 |
+ return -1 |
|
| 378 |
+ |
|
| 379 |
+ tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="' |
|
| 380 |
+ disqus_tag_to_find = 'displayCount('
|
|
| 381 |
+ offset = data.find( tag_to_find ) |
|
| 382 |
+ if offset != -1: |
|
| 383 |
+ start_pos = offset + len( tag_to_find ) |
|
| 384 |
+ end_pos = start_pos |
|
| 385 |
+ while data[ end_pos ] != '"' and end_pos < start_pos + 200: |
|
| 386 |
+ end_pos += 1 |
|
| 387 |
+ if end_pos < start_pos + 200: |
|
| 388 |
+ opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) ) |
|
| 389 |
+ url_GET_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' ) |
|
| 390 |
+ request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + url_GET_data ) |
|
| 391 |
+ try: |
|
| 392 |
+ response = opener.open( request ) |
|
| 393 |
+ disqus_data = response.read() |
|
| 394 |
+ except urllib2.URLError, e: |
|
| 395 |
+ if hasattr( e, 'reason' ): |
|
| 396 |
+ print "Get_num_disqus_comments got an error getting the count:", e.reason |
|
| 397 |
+ elif hasattr( e, 'code' ): |
|
| 398 |
+ print "Get_num_disqus_comments got an error getting the count. Code:", e.code |
|
| 399 |
+ disqus_data = "" |
|
| 400 |
+ disqus_offset = disqus_data.find( disqus_tag_to_find ) |
|
| 401 |
+ if disqus_offset != -1: |
|
| 402 |
+ start_pos = disqus_offset + len( disqus_tag_to_find ) |
|
| 403 |
+ end_pos = disqus_data.find( '}]})', start_pos ) |
|
| 404 |
+ if end_pos != -1: |
|
| 405 |
+ return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] ) |
|
| 406 |
+ |
|
| 407 |
+ return -1 |
|
| 408 |
+ |
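Judging from the markers the function searches for ('displayCount(' and '}]})'), the count.js response evidently looks something like displayCount({"counts": [{"comments": 12, ...}]}); that shape is inferred from the parsing code above, not from Disqus documentation. The extraction step boils down to:

    def extract_disqus_count(disqus_data):
        # Restates the slicing in Get_num_disqus_comments; payload shape is inferred.
        disqus_tag_to_find = 'displayCount('
        start_pos = disqus_data.find(disqus_tag_to_find)
        if start_pos == -1:
            return -1
        start_pos += len(disqus_tag_to_find)
        end_pos = disqus_data.find('}]})', start_pos)
        if end_pos == -1:
            return -1
        payload = json.loads(disqus_data[start_pos:end_pos + 3])  # keep the closing '}]}'
        return int(payload['counts'][0]['comments'])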
|
| 409 |
+def Get_num_retweets( url_string ): |
|
| 410 |
+ try: |
|
| 411 |
+ f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) ) |
|
| 412 |
+ data = f.read() |
|
| 413 |
+ f.close() |
|
| 414 |
+ except urllib2.URLError, e: |
|
| 415 |
+ if hasattr( e, 'reason' ): |
|
| 416 |
+ print "Get_num_retweets got an error:", e.reason |
|
| 417 |
+ elif hasattr( e, 'code' ): |
|
| 418 |
+ print "Get_num_retweets got an error. Code:", e.code |
|
| 419 |
+ return -1 |
|
| 420 |
+ tag_to_find = '<span class="c">' |
|
| 421 |
+ offset = data.find( tag_to_find ) |
|
| 422 |
+ if offset != -1: |
|
| 423 |
+ start_pos = offset + len( tag_to_find ) |
|
| 424 |
+ end_pos = data.find( '<', start_pos ) |
|
| 425 |
+ if end_pos != -1: |
|
| 426 |
+ return int( data[ start_pos:end_pos ] ) |
|
| 427 |
+ return -1 |
|
| 428 |
+ |
|
| 429 |
+def Save_image( url_string, file_path ): |
|
| 430 |
+ try: |
|
| 431 |
+ f = urllib2.urlopen( url_string ) |
|
| 432 |
+ data = f.read() |
|
| 433 |
+ f.close() |
|
| 434 |
+ except urllib2.URLError, e: |
|
| 435 |
+ if hasattr( e, 'reason' ): |
|
| 436 |
+ print "Save_image got an error:", e.reason |
|
| 437 |
+ elif hasattr( e, 'code' ): |
|
| 438 |
+ print "Save_image got an error. Code:", e.code |
|
| 439 |
+ return url_string |
|
| 440 |
+ if len( data ) > 50: |
|
| 441 |
+ f = open( file_path, 'wb' ) |
|
| 442 |
+ f.write( data ) |
|
| 443 |
+ f.close() |
|
| 444 |
+ return 'cache/' + os.path.basename( file_path ) |
|
| 445 |
+ return url_string |
|
| 446 |
+ |
|
| 447 |
+def Make_index_html( yaml_items, stats ): |
|
| 448 |
+ cur_time = int( time.time() ) |
|
| 449 |
+ new_index_fullpath = os.path.join( localdir, 'index.html_new' ) |
|
| 450 |
+ index_fullpath = os.path.join( localdir, 'index.html' ) |
|
| 451 |
+ cache_path = os.path.join( localdir, 'cache' ) |
|
| 452 |
+ |
|
| 453 |
+ files_to_delete = glob.glob( cache_path + '*.png' ) |
|
| 454 |
+# shutil.rmtree( cache_path ) |
|
| 455 |
+# os.mkdir( cache_path ) |
|
| 456 |
+ |
|
| 457 |
+ f = file( new_index_fullpath, 'w' ) |
|
| 458 |
+ f.write( html_head % ( even_background, odd_background ) ) |
|
| 459 |
+# f.write( '<div align="center">\n<table cellpadding="4">' ) |
|
| 460 |
+ |
|
| 461 |
+ f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' ) |
|
| 462 |
+ for median, mean, std_dev in stats: |
|
| 463 |
+ f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f </td> ' % ( median, mean, std_dev ) ) |
|
| 464 |
+ f.write( '</tr>\n</table></div>\n<br />\n' ) |
|
| 465 |
+ |
|
| 466 |
+ f.write( '<div align="center">\n<table>\n' ) |
|
| 467 |
+ image_index = 0 |
|
| 468 |
+ for i in yaml_items[:40]: |
|
| 469 |
+ chart_url = make_chart_url( i['orig_posted'], |
|
| 470 |
+ i['comment_times'], |
|
| 471 |
+ i['comments'], |
|
| 472 |
+ i['retweet_times'], |
|
| 473 |
+ i['retweets'], |
|
| 474 |
+ i['qualified'], |
|
| 475 |
+ image_index % 2 and even_background or odd_background, |
|
| 476 |
+ ) |
|
| 477 |
+ image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) ) |
|
| 478 |
+ f.write( '<tr valign="center" class="%s">\n <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \ |
|
| 479 |
+ ( image_index % 2 and "even" or "odd", |
|
| 480 |
+ i['link'], |
|
| 481 |
+ i['title'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
| 482 |
+ i['author'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
| 483 |
+ ) |
|
| 484 |
+ ) |
|
| 485 |
+ f.write( ' <td>%s</td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) ) 
|
| 486 |
+ f.write( ' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \ |
|
| 487 |
+ ( image_url, |
|
| 488 |
+ img_width, |
|
| 489 |
+ img_height |
|
| 490 |
+ ) |
|
| 491 |
+ ) |
|
| 492 |
+ image_index += 1 |
|
| 493 |
+ f.write( html_footer ) |
|
| 494 |
+ f.close() |
|
| 495 |
+ if os.path.exists( index_fullpath ): |
|
| 496 |
+ os.unlink( index_fullpath ) |
|
| 497 |
+ shutil.move( new_index_fullpath, index_fullpath ) |
|
| 498 |
+ for fname in files_to_delete: |
|
| 499 |
+ os.unlink( fname ) |
|
| 500 |
+ |
|
| 501 |
+def Make_feed_file( yaml_items ): |
|
| 502 |
+ f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' ) |
|
| 503 |
+ f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" ) |
|
| 504 |
+ f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) ) |
|
| 505 |
+ count = 0 |
|
| 506 |
+ for item in yaml_items: |
|
| 507 |
+ now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) ) |
|
| 508 |
+ if item['qualified'] != -1: |
|
| 509 |
+ f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \ |
|
| 510 |
+ ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ), now, item['link'], item['link'], item['author'].encode( 'ascii', 'xmlcharrefreplace' ) ) ) |
|
| 511 |
+ count += 1 |
|
| 512 |
+ if count > 14: |
|
| 513 |
+ break |
|
| 514 |
+ f.write( "</channel></rss>" ) |
|
| 515 |
+ f.close() |
|
| 516 |
+ |
|
| 517 |
+if __name__=='__main__': |
|
| 518 |
+ start_time = time.time() |
|
| 519 |
+ progress_text = [] |
|
| 520 |
+ |
|
| 521 |
+ old_stdout = sys.stdout |
|
| 522 |
+ old_stderr = sys.stderr |
|
| 523 |
+ sys.stdout = sys.stderr = StringIO.StringIO() |
|
| 524 |
+ |
|
| 525 |
+ try: |
|
| 526 |
+ localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) ) |
|
| 527 |
+ # |
|
| 528 |
+ # Read in techcrunch.yaml |
|
| 529 |
+ # |
|
| 530 |
+ # [ { 'title' : 'Title Text',
|
|
| 531 |
+ # 'link' : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/', |
|
| 532 |
+ # 'author' : u'MG Siegler', |
|
| 533 |
+ # 'orig_posted' : 1282197199 |
|
| 534 |
+ # 'tags' : [ u'Google', u'privacy' ] |
|
| 535 |
+ # 'qualified' : -1 |
|
| 536 |
+ # 'comment_times' : [ 1282197199, 1282197407 ] |
|
| 537 |
+ # 'comments' : [ 0, 15 ] |
|
| 538 |
+ # 'slash_comment_times' : [ 1282197199, 1282197407 ] |
|
| 539 |
+ # 'slash_comments' : [ 0, 5 ] |
|
| 540 |
+ # 'slash_comment_times' : [ 1282197199, 1282197407 ] |
|
| 541 |
+ # 'slash_comments' : [ 0, 3 ] |
|
| 542 |
+ # 'retweet_times' : [ 1282197199, 1282197407 ] |
|
| 543 |
+ # 'retweets' : [ 0, 43 ] |
|
| 544 |
+ # }, |
|
| 545 |
+ # { ... }
|
|
| 546 |
+ # ] |
|
| 547 |
+ # |
|
| 548 |
+ yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' ) |
|
| 549 |
+ if os.path.exists( yaml_fullpath ): |
|
| 550 |
+ f = file( yaml_fullpath, 'rb' ) |
|
| 551 |
+ items = yaml.load( f ) |
|
| 552 |
+ f.close() |
|
| 553 |
+ else: |
|
| 554 |
+ print "could not open", yaml_fullpath |
|
| 555 |
+ items = [] |
|
| 556 |
+ |
|
| 557 |
+ progress_text = [ "read techcrunch.yaml" ] |
|
| 558 |
+ process_feed( items ) |
|
| 559 |
+ |
|
| 560 |
+ # |
|
| 561 |
+ # If any work was done, then write files. |
|
| 562 |
+ # |
|
| 563 |
+ if True or any_entry_added: |
|
| 564 |
+ |
|
| 565 |
+ stats = analysis.Process_retweets_for_feed( items ) |
|
| 566 |
+ |
|
| 567 |
+ # We'll only look at the stats for the time 1:00 to 1:30 after posting. |
|
| 568 |
+ median, mean, sigma = stats[2] |
|
| 569 |
+ threshold = median + sigma |
|
| 570 |
+ for item in items: |
|
| 571 |
+ if item['qualified'] == -1: |
|
| 572 |
+ for i in range( len( item['retweet_times'] ) ): |
|
| 573 |
+ r_time = item['retweet_times'][i] |
|
| 574 |
+ if r_time - item['orig_posted'] < 5400: |
|
| 575 |
+ if item['retweets'][i] >= threshold: |
|
| 576 |
+ item['qualified'] = i |
|
| 577 |
+ if r_time - item['orig_posted'] >= 3600: |
|
| 578 |
+ break |
|
| 579 |
+ |
|
| 580 |
+ # |
|
| 581 |
+ # Write out the updated yaml file. |
|
| 582 |
+ # |
|
| 583 |
+ f = file( yaml_fullpath, 'wb' ) |
|
| 584 |
+ yaml.dump( items, f, width=120 ) |
|
| 585 |
+ f.close() |
|
| 586 |
+ f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' ) |
|
| 587 |
+ yaml.dump( items, f, width=120 ) |
|
| 588 |
+ f.close() |
|
| 589 |
+ f = codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' ) |
|
| 590 |
+ yaml.dump( items, f, encoding='utf-8', width=120 ) |
|
| 591 |
+ f.close() |
|
| 592 |
+ |
|
| 593 |
+ Make_feed_file( items ) |
|
| 594 |
+ |
|
| 595 |
+ Make_index_html( items, stats ) |
|
| 596 |
+ else: |
|
| 597 |
+ print "No entries were added this time." |
|
| 598 |
+ |
|
| 599 |
+ except Exception, e: |
|
| 600 |
+ exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str(e) |
|
| 601 |
+ print exceptional_text, ' '.join( progress_text ) |
|
| 602 |
+ traceback.print_exc( file = sys.stdout ) |
|
| 603 |
+ try: |
|
| 604 |
+ sendEmail( 'Exception thrown in techcrunch.py', |
|
| 605 |
+ exceptional_text, |
|
| 606 |
+ ( 'david.blume@gmail.com', ) ) |
|
| 607 |
+ except Exception, e: |
|
| 608 |
+ print "Could not send email to notify you of the exception. :("
|
|
| 609 |
+ |
|
| 610 |
+ message = sys.stdout.getvalue() |
|
| 611 |
+ sys.stdout = old_stdout |
|
| 612 |
+ sys.stderr = old_stderr |
|
| 613 |
+ if not debug: |
|
| 614 |
+ print message |
|
| 615 |
+ |
|
| 616 |
+ # Finally, let's save this to a statistics page |
|
| 617 |
+ if os.path.exists( os.path.join( localdir, 'stats.txt' ) ): |
|
| 618 |
+ f = open( os.path.join( localdir, 'stats.txt' )) |
|
| 619 |
+ try: |
|
| 620 |
+ lines = f.readlines() |
|
| 621 |
+ finally: |
|
| 622 |
+ f.close() |
|
| 623 |
+ else: |
|
| 624 |
+ lines = [] |
|
| 625 |
+ lines = lines[:168] # Just keep the past week's worth |
|
| 626 |
+ status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK" |
|
| 627 |
+ lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status ))
|
|
| 628 |
+ f = open( os.path.join( localdir,'stats.txt' ), 'w' ) |
|
| 629 |
+ f.writelines( lines ) |
|
| 630 |
+ f.close() |
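For reference, the qualification rule buried in the __main__ block above is the heart of the filter: analysis.Process_retweets_for_feed() returns per-half-hour (median, mean, std_dev) statistics, the threshold is the median plus one standard deviation of retweets in the 1:00-1:30 bucket (stats[2]), and an article qualifies when any retweet sample taken less than 90 minutes after posting reaches that threshold. A condensed restatement, assuming the same stats layout:

    def qualify_items(items, stats):
        median, mean, sigma = stats[2]        # the 1:00-1:30 bucket
        threshold = median + sigma
        for item in items:
            if item['qualified'] != -1:
                continue                      # already qualified on an earlier run
            for i, r_time in enumerate(item['retweet_times']):
                age = r_time - item['orig_posted']
                if age < 5400 and item['retweets'][i] >= threshold:
                    item['qualified'] = i     # remember which sample crossed the line
                if age >= 3600:
                    break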
|