David Blume committed on 2018-01-20 20:10:33
Showing 4 changed files, with 4291 additions and 0 deletions.
...
@@ -0,0 +1,19 @@
+Copyright (c) 2018, David Blume
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
...
@@ -0,0 +1,30 @@
+[![License](https://img.shields.io/badge/license-MIT_license-blue.svg)](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt)
+![python2.x](https://img.shields.io/badge/python-2.x-yellow.svg)
+# TechCrunch Feed Filter
+
+This is a Python script, run as a cronjob, that reads the TechCrunch article
+feed and decides which articles to include in its own feed.
+
+Here's a [blog post about it](http://david.dlma.com/blog/my-techcrunch-feed-filter).
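Below is a minimal sketch of the fetch step, using the copy of feedparser bundled in this repo. The feed URL and the keep/skip test are illustrative placeholders only, not the actual scoring logic in techcrunch.py.

```python
import feedparser  # the feedparser.py bundled alongside techcrunch.py

# Hypothetical feed URL; the real script reads the TechCrunch article feed.
d = feedparser.parse('http://feeds.feedburner.com/TechCrunch/')

for entry in d.entries:
    # Placeholder test standing in for the real tag/author scoring.
    if 'apple' in entry.title.lower():
        print entry.title, entry.link
```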
+
+# History
+
+This was originally archived in a Subversion repo. I'd forgotten about the
+version control and had gotten into the habit of just modifying the production
+site.
+
+* 2010-09-03: Original
+* 2010-09-03: Save off the disqus identifier for use later.
+* 2011-02-04: Algorithm changes (tags and author checked), new chart drawing, spaces used instead of tabs.
+* 2011-02-04: Update to the chart drawing algorithm.
+* 2013-08-04: Miscellaneous changes to techcrunch.py
+* 2015-11-23: Resync svn with production site.
+* 2015-11-27: Remove obsolete disqus and retweet code, and refactor style to be more PEP-8ish.
+
+# Is it any good?
+
+[Yes](https://news.ycombinator.com/item?id=3067434).
+
+# License
+
+This software uses the [MIT license](http://git.dlma.com/techcrunch.git/blob/master/LICENSE.txt).
...
@@ -0,0 +1,3612 @@
+#!/usr/bin/env python
+"""Universal feed parser
+
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
+
+Visit http://feedparser.org/ for the latest version
+Visit http://feedparser.org/docs/ for the latest documentation
+
+Required: Python 2.1 or later
+Recommended: Python 2.3 or later
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
+"""
+
+__version__ = "4.2-pre-" + "$Revision: 291 $"[11:14] + "-svn"
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE."""
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+                    "John Beimler <http://john.beimler.org/>",
+                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+                    "Aaron Swartz <http://aaronsw.com/>",
+                    "Kevin Marks <http://epeus.blogspot.com/>",
+                    "Sam Ruby <http://intertwingly.net/>"]
+_debug = 0
+
+# HTTP "User-Agent" header to send to servers when downloading feeds.
+# If you are embedding feedparser in a larger application, you should
+# change this to your application name and URL.
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
+
+# HTTP "Accept" header to send to servers when downloading feeds. If you don't
+# want to send an Accept header, set this to None.
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
+
+# List of preferred XML parsers, by SAX driver name. These will be tried first,
+# but if they're not installed, Python will keep searching through its own list
+# of pre-installed parsers until it finds one that supports everything we need.
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
+
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set
+# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+# or utidylib <http://utidylib.berlios.de/>.
+TIDY_MARKUP = 0
+
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
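These module-level switches are meant to be overridden by an embedding application before it calls `parse()`. A sketch of that pattern (an illustrative aside, not lines from the commit; the application name and feed URL are placeholders):

```python
import feedparser

# Identify the embedding application, as the USER_AGENT comment recommends.
feedparser.USER_AGENT = 'MyApp/1.0 +http://example.com/myapp/'

# The other flags can be flipped the same way before parsing, e.g.:
# feedparser.RESOLVE_RELATIVE_URIS = 0
# feedparser.SANITIZE_HTML = 0

d = feedparser.parse('http://example.com/feed.xml')
```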
+
+# ---------- required modules (should come with any Python distribution) ----------
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
+try:
+    from cStringIO import StringIO as _StringIO
+except:
+    from StringIO import StringIO as _StringIO
+
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
+# gzip is included with most Python distributions, but may not be available if you compiled your own
+try:
+    import gzip
+except:
+    gzip = None
+try:
+    import zlib
+except:
+    zlib = None
+
+# If a real XML parser is available, feedparser will attempt to use it. feedparser has
+# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
+try:
+    import xml.sax
+    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
+    from xml.sax.saxutils import escape as _xmlescape
+    _XML_AVAILABLE = 1
+except:
+    _XML_AVAILABLE = 0
+    def _xmlescape(data,entities={}):
+        data = data.replace('&', '&amp;')
+        data = data.replace('>', '&gt;')
+        data = data.replace('<', '&lt;')
+        for char, entity in entities:
+            data = data.replace(char, entity)
+        return data
+
+# base64 support for Atom feeds that contain embedded binary data
+try:
+    import base64, binascii
+except:
+    base64 = binascii = None
+
+# cjkcodecs and iconv_codec provide support for more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+try:
+    import cjkcodecs.aliases
+except:
+    pass
+try:
+    import iconv_codec
+except:
+    pass
+
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+    import chardet
+    if _debug:
+        import chardet.constants
+        chardet.constants._debug = 1
+except:
+    chardet = None
+
+# reversable htmlentitydefs mappings for Python 2.2
+try:
+    from htmlentitydefs import name2codepoint, codepoint2name
+except:
+    import htmlentitydefs
+    name2codepoint={}
+    codepoint2name={}
+    for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
+        if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
+        name2codepoint[name]=ord(codepoint)
+        codepoint2name[ord(codepoint)]=name
+
+# BeautifulSoup parser used for parsing microformats from embedded HTML content
+# http://www.crummy.com/software/BeautifulSoup/
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
+# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
+# patch and modify the compatibility statement accordingly.
+try:
+    import BeautifulSoup
+except:
+    BeautifulSoup = None
+
+# ---------- don't touch these ----------
+class ThingsNobodyCaresAboutButMe(Exception): pass
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
+class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+class UndeclaredNamespace(Exception): pass
+
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+sgmllib.special = re.compile('<!')
+sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
+
+if sgmllib.endbracket.search(' <').start(0):
+    class EndBracketMatch:
+        endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
+        def search(self,string,index=0):
+            self.match = self.endbracket.match(string,index)
+            if self.match: return self
+        def start(self,n):
+            return self.match.end(n)
+    sgmllib.endbracket = EndBracketMatch()
+
+SUPPORTED_VERSIONS = {'': 'unknown',
+                      'rss090': 'RSS 0.90',
+                      'rss091n': 'RSS 0.91 (Netscape)',
+                      'rss091u': 'RSS 0.91 (Userland)',
+                      'rss092': 'RSS 0.92',
+                      'rss093': 'RSS 0.93',
+                      'rss094': 'RSS 0.94',
+                      'rss20': 'RSS 2.0',
+                      'rss10': 'RSS 1.0',
+                      'rss': 'RSS (unknown version)',
+                      'atom01': 'Atom 0.1',
+                      'atom02': 'Atom 0.2',
+                      'atom03': 'Atom 0.3',
+                      'atom10': 'Atom 1.0',
+                      'atom': 'Atom (unknown version)',
+                      'cdf': 'CDF',
+                      'hotrss': 'Hot RSS'
+                      }
+
+try:
+    UserDict = dict
+except NameError:
+    # Python 2.1 does not have dict
+    from UserDict import UserDict
+    def dict(aList):
+        rc = {}
+        for k, v in aList:
+            rc[k] = v
+        return rc
+
+class FeedParserDict(UserDict):
+    keymap = {'channel': 'feed',
+              'items': 'entries',
+              'guid': 'id',
+              'date': 'updated',
+              'date_parsed': 'updated_parsed',
+              'description': ['subtitle', 'summary'],
+              'url': ['href'],
+              'modified': 'updated',
+              'modified_parsed': 'updated_parsed',
+              'issued': 'published',
+              'issued_parsed': 'published_parsed',
+              'copyright': 'rights',
+              'copyright_detail': 'rights_detail',
+              'tagline': 'subtitle',
+              'tagline_detail': 'subtitle_detail'}
+    def __getitem__(self, key):
+        if key == 'category':
+            return UserDict.__getitem__(self, 'tags')[0]['term']
+        if key == 'enclosures':
+            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
+            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
+        if key == 'license':
+            for link in UserDict.__getitem__(self, 'links'):
+                if link['rel']=='license' and link.has_key('href'):
+                    return link['href']
+        if key == 'categories':
+            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
+        realkey = self.keymap.get(key, key)
+        if type(realkey) == types.ListType:
+            for k in realkey:
+                if UserDict.has_key(self, k):
+                    return UserDict.__getitem__(self, k)
+        if UserDict.has_key(self, key):
+            return UserDict.__getitem__(self, key)
+        return UserDict.__getitem__(self, realkey)
+
+    def __setitem__(self, key, value):
+        for k in self.keymap.keys():
+            if key == k:
+                key = self.keymap[k]
+                if type(key) == types.ListType:
+                    key = key[0]
+        return UserDict.__setitem__(self, key, value)
+
+    def get(self, key, default=None):
+        if self.has_key(key):
+            return self[key]
+        else:
+            return default
+
+    def setdefault(self, key, value):
+        if not self.has_key(key):
+            self[key] = value
+        return self[key]
+
+    def has_key(self, key):
+        try:
+            return hasattr(self, key) or UserDict.has_key(self, key)
+        except AttributeError:
+            return False
+
+    def __getattr__(self, key):
+        try:
+            return self.__dict__[key]
+        except KeyError:
+            pass
+        try:
+            assert not key.startswith('_')
+            return self.__getitem__(key)
+        except:
+            raise AttributeError, "object has no attribute '%s'" % key
+
+    def __setattr__(self, key, value):
+        if key.startswith('_') or key == 'data':
+            self.__dict__[key] = value
+        else:
+            return self.__setitem__(key, value)
+
+    def __contains__(self, key):
+        return self.has_key(key)
+
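A quick illustration of what the keymap above gives callers (an editorial sketch, not lines from the commit): legacy RSS-style keys, their Atom-style equivalents, and attribute access all reach the same stored value.

```python
# Illustrative only. 'tagline' is aliased to 'subtitle' via keymap, and
# __getattr__ layers attribute-style access over item access.
d = FeedParserDict()
d['tagline'] = 'An example subtitle'   # stored under the canonical 'subtitle' key
print d['subtitle']                    # 'An example subtitle'
print d['tagline']                     # same value through the legacy alias
print d.subtitle                       # attribute access goes through __getattr__
```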
+def zopeCompatibilityHack():
+    global FeedParserDict
+    del FeedParserDict
+    def FeedParserDict(aDict=None):
+        rc = {}
+        if aDict:
+            rc.update(aDict)
+        return rc
+
+_ebcdic_to_ascii_map = None
+def _ebcdic_to_ascii(s):
+    global _ebcdic_to_ascii_map
+    if not _ebcdic_to_ascii_map:
+        emap = (
+            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
+            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
+            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
+            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
+            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
+            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
+            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+            )
+        import string
+        _ebcdic_to_ascii_map = string.maketrans( \
+            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+    return s.translate(_ebcdic_to_ascii_map)
+
+_cp1252 = {
+  unichr(128): unichr(8364), # euro sign
+  unichr(130): unichr(8218), # single low-9 quotation mark
+  unichr(131): unichr( 402), # latin small letter f with hook
+  unichr(132): unichr(8222), # double low-9 quotation mark
+  unichr(133): unichr(8230), # horizontal ellipsis
+  unichr(134): unichr(8224), # dagger
+  unichr(135): unichr(8225), # double dagger
+  unichr(136): unichr( 710), # modifier letter circumflex accent
+  unichr(137): unichr(8240), # per mille sign
+  unichr(138): unichr( 352), # latin capital letter s with caron
+  unichr(139): unichr(8249), # single left-pointing angle quotation mark
+  unichr(140): unichr( 338), # latin capital ligature oe
+  unichr(142): unichr( 381), # latin capital letter z with caron
+  unichr(145): unichr(8216), # left single quotation mark
+  unichr(146): unichr(8217), # right single quotation mark
+  unichr(147): unichr(8220), # left double quotation mark
+  unichr(148): unichr(8221), # right double quotation mark
+  unichr(149): unichr(8226), # bullet
+  unichr(150): unichr(8211), # en dash
+  unichr(151): unichr(8212), # em dash
+  unichr(152): unichr( 732), # small tilde
+  unichr(153): unichr(8482), # trade mark sign
+  unichr(154): unichr( 353), # latin small letter s with caron
+  unichr(155): unichr(8250), # single right-pointing angle quotation mark
+  unichr(156): unichr( 339), # latin small ligature oe
+  unichr(158): unichr( 382), # latin small letter z with caron
+  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
+
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+def _urljoin(base, uri):
+    uri = _urifixer.sub(r'\1\3', uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
+
+class _FeedParserMixin:
+    namespaces = {'': '',
+                  'http://backend.userland.com/rss': '',
+                  'http://blogs.law.harvard.edu/tech/rss': '',
+                  'http://purl.org/rss/1.0/': '',
+                  'http://my.netscape.com/rdf/simple/0.9/': '',
+                  'http://example.com/newformat#': '',
+                  'http://example.com/necho': '',
+                  'http://purl.org/echo/': '',
+                  'uri/of/echo/namespace#': '',
+                  'http://purl.org/pie/': '',
+                  'http://purl.org/atom/ns#': '',
+                  'http://www.w3.org/2005/Atom': '',
+                  'http://purl.org/rss/1.0/modules/rss091#': '',
+
+                  'http://webns.net/mvcb/': 'admin',
+                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
+                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
+                  'http://media.tangent.org/rss/1.0/': 'audio',
+                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
+                  'http://web.resource.org/cc/': 'cc',
+                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
+                  'http://purl.org/rss/1.0/modules/company': 'co',
+                  'http://purl.org/rss/1.0/modules/content/': 'content',
+                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
+                  'http://purl.org/dc/elements/1.1/': 'dc',
+                  'http://purl.org/dc/terms/': 'dcterms',
+                  'http://purl.org/rss/1.0/modules/email/': 'email',
+                  'http://purl.org/rss/1.0/modules/event/': 'ev',
+                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
+                  'http://freshmeat.net/rss/fm/': 'fm',
+                  'http://xmlns.com/foaf/0.1/': 'foaf',
+                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
+                  'http://postneo.com/icbm/': 'icbm',
+                  'http://purl.org/rss/1.0/modules/image/': 'image',
+                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
+                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+                  'http://purl.org/rss/1.0/modules/link/': 'l',
+                  'http://search.yahoo.com/mrss': 'media',
+                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
+                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
+                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
+                  'http://purl.org/rss/1.0/modules/search/': 'search',
+                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
+                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
+                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
+                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
+                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
+                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
+                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
+                  'http://wellformedweb.org/commentAPI/': 'wfw',
+                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+                  'http://www.w3.org/1999/xhtml': 'xhtml',
+                  'http://www.w3.org/1999/xlink': 'xlink',
+                  'http://www.w3.org/XML/1998/namespace': 'xml'
+}
+    _matchnamespaces = {}
+
+    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    html_types = ['text/html', 'application/xhtml+xml']
+
+    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
+        if _debug: sys.stderr.write('initializing FeedParser\n')
+        if not self._matchnamespaces:
+            for k, v in self.namespaces.items():
+                self._matchnamespaces[k.lower()] = v
+        self.feeddata = FeedParserDict() # feed-level data
+        self.encoding = encoding # character encoding
+        self.entries = [] # list of entry-level data
+        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
+        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
+
+        # the following are used internally to track state;
+        # this is really out of control and should be refactored
+        self.infeed = 0
+        self.inentry = 0
+        self.incontent = 0
+        self.intextinput = 0
+        self.inimage = 0
+        self.inauthor = 0
+        self.incontributor = 0
+        self.inpublisher = 0
+        self.insource = 0
+        self.sourcedata = FeedParserDict()
+        self.contentparams = FeedParserDict()
+        self._summaryKey = None
+        self.namespacemap = {}
+        self.elementstack = []
+        self.basestack = []
+        self.langstack = []
+        self.baseuri = baseuri or ''
+        self.lang = baselang or None
+        self.svgOK = 0
+        self.hasTitle = 0
+        if baselang:
+            self.feeddata['language'] = baselang.replace('_','-')
+
+    def unknown_starttag(self, tag, attrs):
+        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
+        # normalize attrs
+        attrs = [(k.lower(), v) for k, v in attrs]
+        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+
+        # track xml:base and xml:lang
+        attrsD = dict(attrs)
+        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+        if type(baseuri) != type(u''):
+            try:
+                baseuri = unicode(baseuri, self.encoding)
+            except:
+                baseuri = unicode(baseuri, 'iso-8859-1')
+        self.baseuri = _urljoin(self.baseuri, baseuri)
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
+        if lang == '':
+            # xml:lang could be explicitly set to '', we need to capture that
+            lang = None
+        elif lang is None:
+            # if no xml:lang is specified, use parent lang
+            lang = self.lang
+        if lang:
+            if tag in ('feed', 'rss', 'rdf:RDF'):
+                self.feeddata['language'] = lang.replace('_','-')
+        self.lang = lang
+        self.basestack.append(self.baseuri)
+        self.langstack.append(lang)
+
+        # track namespaces
+        for prefix, uri in attrs:
+            if prefix.startswith('xmlns:'):
+                self.trackNamespace(prefix[6:], uri)
+            elif prefix == 'xmlns':
+                self.trackNamespace(None, uri)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+            # element declared itself as escaped markup, but it isn't really
+            self.contentparams['type'] = 'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+            if tag.find(':') <> -1:
+                prefix, tag = tag.split(':', 1)
+                namespace = self.namespacesInUse.get(prefix, '')
+                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+                    attrs.append(('xmlns',namespace))
+                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
+                    attrs.append(('xmlns',namespace))
+            if tag == 'svg': self.svgOK += 1
+            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+
+        # special hack for better tracking of empty textinput/image elements in illformed feeds
+        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
+            self.intextinput = 0
+        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
+            self.inimage = 0
+
+        # call special handler (if defined) or default handler
+        methodname = '_start_' + prefix + suffix
+        try:
+            method = getattr(self, methodname)
+            return method(attrsD)
+        except AttributeError:
+            return self.push(prefix + suffix, 1)
+
+    def unknown_endtag(self, tag):
+        if _debug: sys.stderr.write('end %s\n' % tag)
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+        if suffix == 'svg' and self.svgOK: self.svgOK -= 1
+
+        # call special handler (if defined) or default handler
+        methodname = '_end_' + prefix + suffix
+        try:
+            if self.svgOK: raise AttributeError()
+            method = getattr(self, methodname)
+            method()
+        except AttributeError:
+            self.pop(prefix + suffix)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            # element declared itself as escaped markup, but it isn't really
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+            self.contentparams['type'] = 'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+            tag = tag.split(':')[-1]
+            self.handle_data('</%s>' % tag, escape=0)
+
+        # track xml:base and xml:lang going out of scope
+        if self.basestack:
+            self.basestack.pop()
+            if self.basestack and self.basestack[-1]:
+                self.baseuri = self.basestack[-1]
+        if self.langstack:
+            self.langstack.pop()
+            if self.langstack: # and (self.langstack[-1] is not None):
+                self.lang = self.langstack[-1]
+
+    def handle_charref(self, ref):
+        # called for each character reference, e.g. for '&#160;', ref will be '160'
+        if not self.elementstack: return
+        ref = ref.lower()
+        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
+            text = '&#%s;' % ref
+        else:
+            if ref[0] == 'x':
+                c = int(ref[1:], 16)
+            else:
+                c = int(ref)
+            text = unichr(c).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_entityref(self, ref):
+        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
+        if not self.elementstack: return
+        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
+        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+            text = '&%s;' % ref
+        elif ref in self.entities.keys():
+            text = self.entities[ref]
+            if text.startswith('&#') and text.endswith(';'):
+                return self.handle_entityref(text)
+        else:
+            try: name2codepoint[ref]
+            except KeyError: text = '&%s;' % ref
+            else: text = unichr(name2codepoint[ref]).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_data(self, text, escape=1):
+        # called for each block of plain text, i.e. outside of any tag and
+        # not containing any character or entity references
+        if not self.elementstack: return
+        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
+            text = _xmlescape(text)
+        self.elementstack[-1][2].append(text)
+
+    def handle_comment(self, text):
+        # called for each comment, e.g. <!-- insert message here -->
+        pass
+
+    def handle_pi(self, text):
+        # called for each processing instruction, e.g. <?instruction>
+        pass
+
+    def handle_decl(self, text):
+        pass
+
+    def parse_declaration(self, i):
+        # override internal declaration handler to handle CDATA blocks
+        if _debug: sys.stderr.write('entering parse_declaration\n')
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1: k = len(self.rawdata)
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+            return k+3
+        else:
+            k = self.rawdata.find('>', i)
+            return k+1
+
+    def mapContentType(self, contentType):
+        contentType = contentType.lower()
+        if contentType == 'text':
+            contentType = 'text/plain'
+        elif contentType == 'html':
+            contentType = 'text/html'
+        elif contentType == 'xhtml':
+            contentType = 'application/xhtml+xml'
+        return contentType
+
+    def trackNamespace(self, prefix, uri):
+        loweruri = uri.lower()
+        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
+            self.version = 'rss090'
+        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
+            self.version = 'rss10'
+        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+            self.version = 'atom10'
+        if loweruri.find('backend.userland.com/rss') <> -1:
+            # match any backend.userland.com namespace
+            uri = 'http://backend.userland.com/rss'
+            loweruri = uri
+        if self._matchnamespaces.has_key(loweruri):
+            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+        else:
+            self.namespacesInUse[prefix or ''] = uri
+
+    def resolveURI(self, uri):
+        return _urljoin(self.baseuri or '', uri)
+
+    def decodeEntities(self, element, data):
+        return data
+
+    def strattrs(self, attrs):
+        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
+
+    def push(self, element, expectingText):
+        self.elementstack.append([element, expectingText, []])
+
|
694 |
+ if not self.elementstack: return |
|
695 |
+ if self.elementstack[-1][0] != element: return |
|
696 |
+ |
|
697 |
+ element, expectingText, pieces = self.elementstack.pop() |
|
698 |
+ |
|
699 |
+ if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': |
|
700 |
+ # remove enclosing child element, but only if it is a <div> and |
|
701 |
+ # only if all the remaining content is nested underneath it. |
|
702 |
+ # This means that the divs would be retained in the following: |
|
703 |
+ # <div>foo</div><div>bar</div> |
|
704 |
+ while pieces and len(pieces)>1 and not pieces[-1].strip(): |
|
705 |
+ del pieces[-1] |
|
706 |
+ while pieces and len(pieces)>1 and not pieces[0].strip(): |
|
707 |
+ del pieces[0] |
|
708 |
+ if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>': |
|
709 |
+ depth = 0 |
|
710 |
+ for piece in pieces[:-1]: |
|
711 |
+ if piece.startswith('</'): |
|
712 |
+ depth -= 1 |
|
713 |
+ if depth == 0: break |
|
714 |
+ elif piece.startswith('<') and not piece.endswith('/>'): |
|
715 |
+ depth += 1 |
|
716 |
+ else: |
|
717 |
+ pieces = pieces[1:-1] |
|
718 |
+ |
|
719 |
+ output = ''.join(pieces) |
|
720 |
+ if stripWhitespace: |
|
721 |
+ output = output.strip() |
|
722 |
+ if not expectingText: return output |
|
723 |
+ |
|
724 |
+ # decode base64 content |
|
725 |
+ if base64 and self.contentparams.get('base64', 0): |
|
726 |
+ try: |
|
727 |
+ output = base64.decodestring(output) |
|
728 |
+ except binascii.Error: |
|
729 |
+ pass |
|
730 |
+ except binascii.Incomplete: |
|
731 |
+ pass |
|
732 |
+ |
|
733 |
+ # resolve relative URIs |
|
734 |
+ if (element in self.can_be_relative_uri) and output: |
|
735 |
+ output = self.resolveURI(output) |
|
736 |
+ |
|
737 |
+ # decode entities within embedded markup |
|
738 |
+ if not self.contentparams.get('base64', 0): |
|
739 |
+ output = self.decodeEntities(element, output) |
|
740 |
+ |
|
741 |
+ if self.lookslikehtml(output): |
|
742 |
+ self.contentparams['type']='text/html' |
|
743 |
+ |
|
744 |
+ # remove temporary cruft from contentparams |
|
745 |
+ try: |
|
746 |
+ del self.contentparams['mode'] |
|
747 |
+ except KeyError: |
|
748 |
+ pass |
|
749 |
+ try: |
|
750 |
+ del self.contentparams['base64'] |
|
751 |
+ except KeyError: |
|
752 |
+ pass |
|
753 |
+ |
|
754 |
+ is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types |
|
755 |
+ # resolve relative URIs within embedded markup |
|
756 |
+ if is_htmlish and RESOLVE_RELATIVE_URIS: |
|
757 |
+ if element in self.can_contain_relative_uris: |
|
758 |
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) |
|
759 |
+ |
|
760 |
+ # parse microformats |
|
761 |
+ # (must do this before sanitizing because some microformats |
|
762 |
+ # rely on elements that we sanitize) |
|
763 |
+ if is_htmlish and element in ['content', 'description', 'summary']: |
|
764 |
+ mfresults = _parseMicroformats(output, self.baseuri, self.encoding) |
|
765 |
+ if mfresults: |
|
766 |
+ for tag in mfresults.get('tags', []): |
|
767 |
+ self._addTag(tag['term'], tag['scheme'], tag['label']) |
|
768 |
+ for enclosure in mfresults.get('enclosures', []): |
|
769 |
+ self._start_enclosure(enclosure) |
|
770 |
+ for xfn in mfresults.get('xfn', []): |
|
771 |
+ self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) |
|
772 |
+ vcard = mfresults.get('vcard') |
|
773 |
+ if vcard: |
|
774 |
+ self._getContext()['vcard'] = vcard |
|
775 |
+ |
|
776 |
+ # sanitize embedded markup |
|
777 |
+ if is_htmlish and SANITIZE_HTML: |
|
778 |
+ if element in self.can_contain_dangerous_markup: |
|
779 |
+ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) |
|
780 |
+ |
|
781 |
+ if self.encoding and type(output) != type(u''): |
|
782 |
+ try: |
|
783 |
+ output = unicode(output, self.encoding) |
|
784 |
+ except: |
|
785 |
+ pass |
|
786 |
+ |
|
787 |
+ # address common error where people take data that is already |
|
788 |
+ # utf-8, presume that it is iso-8859-1, and re-encode it. |
|
789 |
+ if self.encoding=='utf-8' and type(output) == type(u''): |
|
790 |
+ try: |
|
791 |
+ output = unicode(output.encode('iso-8859-1'), 'utf-8') |
|
792 |
+ except: |
|
793 |
+ pass |
|
794 |
+ |
|
795 |
+ # map win-1252 extensions to the proper code points |
|
796 |
+ if type(output) == type(u''): |
|
797 |
+ output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) |
|
798 |
+ |
|
799 |
+ # categories/tags/keywords/whatever are handled in _end_category |
|
800 |
+ if element == 'category': |
|
801 |
+ return output |
|
802 |
+ |
|
803 |
+ if element == 'title' and self.hasTitle: |
|
804 |
+ return output |
|
805 |
+ |
|
806 |
+ # store output in appropriate place(s) |
|
807 |
+ if self.inentry and not self.insource: |
|
808 |
+ if element == 'content': |
|
809 |
+ self.entries[-1].setdefault(element, []) |
|
810 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
811 |
+ contentparams['value'] = output |
|
812 |
+ self.entries[-1][element].append(contentparams) |
|
813 |
+ elif element == 'link': |
|
814 |
+ self.entries[-1][element] = output |
|
815 |
+ if output: |
|
816 |
+ self.entries[-1]['links'][-1]['href'] = output |
|
817 |
+ else: |
|
818 |
+ if element == 'description': |
|
819 |
+ element = 'summary' |
|
820 |
+ self.entries[-1][element] = output |
|
821 |
+ if self.incontent: |
|
822 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
823 |
+ contentparams['value'] = output |
|
824 |
+ self.entries[-1][element + '_detail'] = contentparams |
|
825 |
+ elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage): |
|
826 |
+ context = self._getContext() |
|
827 |
+ if element == 'description': |
|
828 |
+ element = 'subtitle' |
|
829 |
+ context[element] = output |
|
830 |
+ if element == 'link': |
|
831 |
+ context['links'][-1]['href'] = output |
|
832 |
+ elif self.incontent: |
|
833 |
+ contentparams = copy.deepcopy(self.contentparams) |
|
834 |
+ contentparams['value'] = output |
|
835 |
+ context[element + '_detail'] = contentparams |
|
836 |
+ return output |
|
837 |
+ |
|
838 |
+ def pushContent(self, tag, attrsD, defaultContentType, expectingText): |
|
839 |
+ self.incontent += 1 |
|
840 |
+ if self.lang: self.lang=self.lang.replace('_','-') |
|
841 |
+ self.contentparams = FeedParserDict({ |
|
842 |
+ 'type': self.mapContentType(attrsD.get('type', defaultContentType)), |
|
843 |
+ 'language': self.lang, |
|
844 |
+ 'base': self.baseuri}) |
|
845 |
+ self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) |
|
846 |
+ self.push(tag, expectingText) |
|
847 |
+ |
|
848 |
+ def popContent(self, tag): |
|
849 |
+ value = self.pop(tag) |
|
850 |
+ self.incontent -= 1 |
|
851 |
+ self.contentparams.clear() |
|
852 |
+ return value |
|
853 |
+ |
|
854 |
+ # a number of elements in a number of RSS variants are nominally plain |
|
855 |
+ # text, but this is routinely ignored. This is an attempt to detect |
|
856 |
+ # the most common cases. As false positives often result in silent |
|
857 |
+ # data loss, this function errs on the conservative side. |
|
858 |
+ def lookslikehtml(self, str): |
|
859 |
+ if self.version.startswith('atom'): return |
|
860 |
+ if self.contentparams.get('type','text/html') != 'text/plain': return |
|
861 |
+ |
|
862 |
+ # must have a close tag or a entity reference to qualify |
|
863 |
+ if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return |
|
864 |
+ |
|
865 |
+ # all tags must be in a restricted subset of valid HTML tags |
|
866 |
+ if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, |
|
867 |
+ re.findall(r'</?(\w+)',str)): return |
|
868 |
+ |
|
869 |
+ # all entities must have been defined as valid HTML entities |
|
870 |
+ from htmlentitydefs import entitydefs |
|
871 |
+ if filter(lambda e: e not in entitydefs.keys(), |
|
872 |
+ re.findall(r'&(\w+);',str)): return |
|
873 |
+ |
|
874 |
+ return 1 |
|
875 |
+ |
|
876 |
+ def _mapToStandardPrefix(self, name): |
|
877 |
+ colonpos = name.find(':') |
|
878 |
+ if colonpos <> -1: |
|
879 |
+ prefix = name[:colonpos] |
|
880 |
+ suffix = name[colonpos+1:] |
|
881 |
+ prefix = self.namespacemap.get(prefix, prefix) |
|
882 |
+ name = prefix + ':' + suffix |
|
883 |
+ return name |
|
884 |
+ |
|
885 |
+ def _getAttribute(self, attrsD, name): |
|
886 |
+ return attrsD.get(self._mapToStandardPrefix(name)) |
|
887 |
+ |
|
888 |
+ def _isBase64(self, attrsD, contentparams): |
|
889 |
+ if attrsD.get('mode', '') == 'base64': |
|
890 |
+ return 1 |
|
891 |
+ if self.contentparams['type'].startswith('text/'): |
|
892 |
+ return 0 |
|
893 |
+ if self.contentparams['type'].endswith('+xml'): |
|
894 |
+ return 0 |
|
895 |
+ if self.contentparams['type'].endswith('/xml'): |
|
896 |
+ return 0 |
|
897 |
+ return 1 |
|
898 |
+ |
|
899 |
+ def _itsAnHrefDamnIt(self, attrsD): |
|
900 |
+ href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) |
|
901 |
+ if href: |
|
902 |
+ try: |
|
903 |
+ del attrsD['url'] |
|
904 |
+ except KeyError: |
|
905 |
+ pass |
|
906 |
+ try: |
|
907 |
+ del attrsD['uri'] |
|
908 |
+ except KeyError: |
|
909 |
+ pass |
|
910 |
+ attrsD['href'] = href |
|
911 |
+ return attrsD |
|
912 |
+ |
|
913 |
+ def _save(self, key, value): |
|
914 |
+ context = self._getContext() |
|
915 |
+ context.setdefault(key, value) |
|
916 |
+ |
|
917 |
+ def _start_rss(self, attrsD): |
|
918 |
+ versionmap = {'0.91': 'rss091u', |
|
919 |
+ '0.92': 'rss092', |
|
920 |
+ '0.93': 'rss093', |
|
921 |
+ '0.94': 'rss094'} |
|
922 |
+ if not self.version: |
|
923 |
+ attr_version = attrsD.get('version', '') |
|
924 |
+ version = versionmap.get(attr_version) |
|
925 |
+ if version: |
|
926 |
+ self.version = version |
|
927 |
+ elif attr_version.startswith('2.'): |
|
928 |
+ self.version = 'rss20' |
|
929 |
+ else: |
|
930 |
+ self.version = 'rss' |
|
931 |
+ |
|
932 |
+ def _start_dlhottitles(self, attrsD): |
|
933 |
+ self.version = 'hotrss' |
|
934 |
+ |
|
935 |
+ def _start_channel(self, attrsD): |
|
936 |
+ self.infeed = 1 |
|
937 |
+ self._cdf_common(attrsD) |
|
938 |
+ _start_feedinfo = _start_channel |
|
939 |
+ |
|
940 |
+ def _cdf_common(self, attrsD): |
|
941 |
+ if attrsD.has_key('lastmod'): |
|
942 |
+ self._start_modified({}) |
|
943 |
+ self.elementstack[-1][-1] = attrsD['lastmod'] |
|
944 |
+ self._end_modified() |
|
945 |
+ if attrsD.has_key('href'): |
|
946 |
+ self._start_link({}) |
|
947 |
+ self.elementstack[-1][-1] = attrsD['href'] |
|
948 |
+ self._end_link() |
|
949 |
+ |
|
950 |
+ def _start_feed(self, attrsD): |
|
951 |
+ self.infeed = 1 |
|
952 |
+ versionmap = {'0.1': 'atom01', |
|
953 |
+ '0.2': 'atom02', |
|
954 |
+ '0.3': 'atom03'} |
|
955 |
+ if not self.version: |
|
956 |
+ attr_version = attrsD.get('version') |
|
957 |
+ version = versionmap.get(attr_version) |
|
958 |
+ if version: |
|
959 |
+ self.version = version |
|
960 |
+ else: |
|
961 |
+ self.version = 'atom' |
|
962 |
+ |
|
963 |
+ def _end_channel(self): |
|
964 |
+ self.infeed = 0 |
|
965 |
+ _end_feed = _end_channel |
|
966 |
+ |
|
967 |
+ def _start_image(self, attrsD): |
|
968 |
+ context = self._getContext() |
|
969 |
+ context.setdefault('image', FeedParserDict()) |
|
970 |
+ self.inimage = 1 |
|
971 |
+ self.hasTitle = 0 |
|
972 |
+ self.push('image', 0) |
|
973 |
+ |
|
974 |
+ def _end_image(self): |
|
975 |
+ self.pop('image') |
|
976 |
+ self.inimage = 0 |
|
977 |
+ |
|
978 |
+ def _start_textinput(self, attrsD): |
|
979 |
+ context = self._getContext() |
|
980 |
+ context.setdefault('textinput', FeedParserDict()) |
|
981 |
+ self.intextinput = 1 |
|
982 |
+ self.hasTitle = 0 |
|
983 |
+ self.push('textinput', 0) |
|
984 |
+ _start_textInput = _start_textinput |
|
985 |
+ |
|
986 |
+ def _end_textinput(self): |
|
987 |
+ self.pop('textinput') |
|
988 |
+ self.intextinput = 0 |
|
989 |
+ _end_textInput = _end_textinput |
|
990 |
+ |
|
991 |
+ def _start_author(self, attrsD): |
|
992 |
+ self.inauthor = 1 |
|
993 |
+ self.push('author', 1) |
|
994 |
+ _start_managingeditor = _start_author |
|
995 |
+ _start_dc_author = _start_author |
|
996 |
+ _start_dc_creator = _start_author |
|
997 |
+ _start_itunes_author = _start_author |
|
998 |
+ |
|
999 |
+ def _end_author(self): |
|
1000 |
+ self.pop('author') |
|
1001 |
+ self.inauthor = 0 |
|
1002 |
+ self._sync_author_detail() |
|
1003 |
+ _end_managingeditor = _end_author |
|
1004 |
+ _end_dc_author = _end_author |
|
1005 |
+ _end_dc_creator = _end_author |
|
1006 |
+ _end_itunes_author = _end_author |
|
1007 |
+ |
|
1008 |
+ def _start_itunes_owner(self, attrsD): |
|
1009 |
+ self.inpublisher = 1 |
|
1010 |
+ self.push('publisher', 0) |
|
1011 |
+ |
|
1012 |
+ def _end_itunes_owner(self): |
|
1013 |
+ self.pop('publisher') |
|
1014 |
+ self.inpublisher = 0 |
|
1015 |
+ self._sync_author_detail('publisher') |
|
1016 |
+ |
|
1017 |
+ def _start_contributor(self, attrsD): |
|
1018 |
+ self.incontributor = 1 |
|
1019 |
+ context = self._getContext() |
|
1020 |
+ context.setdefault('contributors', []) |
|
1021 |
+ context['contributors'].append(FeedParserDict()) |
|
1022 |
+ self.push('contributor', 0) |
|
1023 |
+ |
|
1024 |
+ def _end_contributor(self): |
|
1025 |
+ self.pop('contributor') |
|
1026 |
+ self.incontributor = 0 |
|
1027 |
+ |
|
1028 |
+ def _start_dc_contributor(self, attrsD): |
|
1029 |
+ self.incontributor = 1 |
|
1030 |
+ context = self._getContext() |
|
1031 |
+ context.setdefault('contributors', []) |
|
1032 |
+ context['contributors'].append(FeedParserDict()) |
|
1033 |
+ self.push('name', 0) |
|
1034 |
+ |
|
1035 |
+ def _end_dc_contributor(self): |
|
1036 |
+ self._end_name() |
|
1037 |
+ self.incontributor = 0 |
|
1038 |
+ |
|
1039 |
+ def _start_name(self, attrsD): |
|
1040 |
+ self.push('name', 0) |
|
1041 |
+ _start_itunes_name = _start_name |
|
1042 |
+ |
|
1043 |
+ def _end_name(self): |
|
1044 |
+ value = self.pop('name') |
|
1045 |
+ if self.inpublisher: |
|
1046 |
+ self._save_author('name', value, 'publisher') |
|
1047 |
+ elif self.inauthor: |
|
1048 |
+ self._save_author('name', value) |
|
1049 |
+ elif self.incontributor: |
|
1050 |
+ self._save_contributor('name', value) |
|
1051 |
+ elif self.intextinput: |
|
1052 |
+ context = self._getContext() |
|
1053 |
+ context['name'] = value |
|
1054 |
+ _end_itunes_name = _end_name |
|
1055 |
+ |
|
1056 |
+ def _start_width(self, attrsD): |
|
1057 |
+ self.push('width', 0) |
|
1058 |
+ |
|
1059 |
+ def _end_width(self): |
|
1060 |
+ value = self.pop('width') |
|
1061 |
+ try: |
|
1062 |
+ value = int(value) |
|
1063 |
+ except: |
|
1064 |
+ value = 0 |
|
1065 |
+ if self.inimage: |
|
1066 |
+ context = self._getContext() |
|
1067 |
+ context['width'] = value |
|
1068 |
+ |
|
1069 |
+ def _start_height(self, attrsD): |
|
1070 |
+ self.push('height', 0) |
|
1071 |
+ |
|
1072 |
+ def _end_height(self): |
|
1073 |
+ value = self.pop('height') |
|
1074 |
+ try: |
|
1075 |
+ value = int(value) |
|
1076 |
+ except: |
|
1077 |
+ value = 0 |
|
1078 |
+ if self.inimage: |
|
1079 |
+ context = self._getContext() |
|
1080 |
+ context['height'] = value |
|
1081 |
+ |
|
1082 |
+ def _start_url(self, attrsD): |
|
1083 |
+ self.push('href', 1) |
|
1084 |
+ _start_homepage = _start_url |
|
1085 |
+ _start_uri = _start_url |
|
1086 |
+ |
|
1087 |
+ def _end_url(self): |
|
1088 |
+ value = self.pop('href') |
|
1089 |
+ if self.inauthor: |
|
1090 |
+ self._save_author('href', value) |
|
1091 |
+ elif self.incontributor: |
|
1092 |
+ self._save_contributor('href', value) |
|
1093 |
+ _end_homepage = _end_url |
|
1094 |
+ _end_uri = _end_url |
|
1095 |
+ |
|
1096 |
+ def _start_email(self, attrsD): |
|
1097 |
+ self.push('email', 0) |
|
1098 |
+ _start_itunes_email = _start_email |
|
1099 |
+ |
|
1100 |
+ def _end_email(self): |
|
1101 |
+ value = self.pop('email') |
|
1102 |
+ if self.inpublisher: |
|
1103 |
+ self._save_author('email', value, 'publisher') |
|
1104 |
+ elif self.inauthor: |
|
1105 |
+ self._save_author('email', value) |
|
1106 |
+ elif self.incontributor: |
|
1107 |
+ self._save_contributor('email', value) |
|
1108 |
+ _end_itunes_email = _end_email |
|
1109 |
+ |
|
1110 |
+ def _getContext(self): |
|
1111 |
+ if self.insource: |
|
1112 |
+ context = self.sourcedata |
|
1113 |
+ elif self.inimage: |
|
1114 |
+ context = self.feeddata['image'] |
|
1115 |
+ elif self.intextinput: |
|
1116 |
+ context = self.feeddata['textinput'] |
|
1117 |
+ elif self.inentry: |
|
1118 |
+ context = self.entries[-1] |
|
1119 |
+ else: |
|
1120 |
+ context = self.feeddata |
|
1121 |
+ return context |
|
1122 |
+ |
|
1123 |
+ def _save_author(self, key, value, prefix='author'): |
|
1124 |
+ context = self._getContext() |
|
1125 |
+ context.setdefault(prefix + '_detail', FeedParserDict()) |
|
1126 |
+ context[prefix + '_detail'][key] = value |
|
1127 |
+ self._sync_author_detail() |
|
1128 |
+ |
|
1129 |
+ def _save_contributor(self, key, value): |
|
1130 |
+ context = self._getContext() |
|
1131 |
+ context.setdefault('contributors', [FeedParserDict()]) |
|
1132 |
+ context['contributors'][-1][key] = value |
|
1133 |
+ |
|
1134 |
+ def _sync_author_detail(self, key='author'): |
|
1135 |
+ context = self._getContext() |
|
1136 |
+ detail = context.get('%s_detail' % key) |
|
1137 |
+ if detail: |
|
1138 |
+ name = detail.get('name') |
|
1139 |
+ email = detail.get('email') |
|
1140 |
+ if name and email: |
|
1141 |
+ context[key] = '%s (%s)' % (name, email) |
|
1142 |
+ elif name: |
|
1143 |
+ context[key] = name |
|
1144 |
+ elif email: |
|
1145 |
+ context[key] = email |
|
1146 |
+ else: |
|
1147 |
+ author, email = context.get(key), None |
|
1148 |
+ if not author: return |
|
1149 |
+ emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author) |
|
1150 |
+ if emailmatch: |
|
1151 |
+ email = emailmatch.group(0) |
|
1152 |
+ # probably a better way to do the following, but it passes all the tests |
|
1153 |
+ author = author.replace(email, '') |
|
1154 |
+ author = author.replace('()', '') |
|
1155 |
+ author = author.replace('<>', '') |
|
1156 |
+ author = author.replace('<>', '') |
|
1157 |
+ author = author.strip() |
|
1158 |
+ if author and (author[0] == '('): |
|
1159 |
+ author = author[1:] |
|
1160 |
+ if author and (author[-1] == ')'): |
|
1161 |
+ author = author[:-1] |
|
1162 |
+ author = author.strip() |
|
1163 |
+ if author or email: |
|
1164 |
+ context.setdefault('%s_detail' % key, FeedParserDict()) |
|
1165 |
+ if author: |
|
1166 |
+ context['%s_detail' % key]['name'] = author |
|
1167 |
+ if email: |
|
1168 |
+ context['%s_detail' % key]['email'] = email |
|
1169 |
+ |
|
1170 |
+ def _start_subtitle(self, attrsD): |
|
1171 |
+ self.pushContent('subtitle', attrsD, 'text/plain', 1) |
|
1172 |
+ _start_tagline = _start_subtitle |
|
1173 |
+ _start_itunes_subtitle = _start_subtitle |
|
1174 |
+ |
|
1175 |
+ def _end_subtitle(self): |
|
1176 |
+ self.popContent('subtitle') |
|
1177 |
+ _end_tagline = _end_subtitle |
|
1178 |
+ _end_itunes_subtitle = _end_subtitle |
|
1179 |
+ |
|
1180 |
+ def _start_rights(self, attrsD): |
|
1181 |
+ self.pushContent('rights', attrsD, 'text/plain', 1) |
|
1182 |
+ _start_dc_rights = _start_rights |
|
1183 |
+ _start_copyright = _start_rights |
|
1184 |
+ |
|
1185 |
+ def _end_rights(self): |
|
1186 |
+ self.popContent('rights') |
|
1187 |
+ _end_dc_rights = _end_rights |
|
1188 |
+ _end_copyright = _end_rights |
|
1189 |
+ |
|
1190 |
+ def _start_item(self, attrsD): |
|
1191 |
+ self.entries.append(FeedParserDict()) |
|
1192 |
+ self.push('item', 0) |
|
1193 |
+ self.inentry = 1 |
|
1194 |
+ self.guidislink = 0 |
|
1195 |
+ self.hasTitle = 0 |
|
1196 |
+ id = self._getAttribute(attrsD, 'rdf:about') |
|
1197 |
+ if id: |
|
1198 |
+ context = self._getContext() |
|
1199 |
+ context['id'] = id |
|
1200 |
+ self._cdf_common(attrsD) |
|
1201 |
+ _start_entry = _start_item |
|
1202 |
+ _start_product = _start_item |
|
1203 |
+ |
|
1204 |
+ def _end_item(self): |
|
1205 |
+ self.pop('item') |
|
1206 |
+ self.inentry = 0 |
|
1207 |
+ _end_entry = _end_item |
|
1208 |
+ |
|
1209 |
+ def _start_dc_language(self, attrsD): |
|
1210 |
+ self.push('language', 1) |
|
1211 |
+ _start_language = _start_dc_language |
|
1212 |
+ |
|
1213 |
+ def _end_dc_language(self): |
|
1214 |
+ self.lang = self.pop('language') |
|
1215 |
+ _end_language = _end_dc_language |
|
1216 |
+ |
|
1217 |
+ def _start_dc_publisher(self, attrsD): |
|
1218 |
+ self.push('publisher', 1) |
|
1219 |
+ _start_webmaster = _start_dc_publisher |
|
1220 |
+ |
|
1221 |
+ def _end_dc_publisher(self): |
|
1222 |
+ self.pop('publisher') |
|
1223 |
+ self._sync_author_detail('publisher') |
|
1224 |
+ _end_webmaster = _end_dc_publisher |
|
1225 |
+ |
|
1226 |
+ def _start_published(self, attrsD): |
|
1227 |
+ self.push('published', 1) |
|
1228 |
+ _start_dcterms_issued = _start_published |
|
1229 |
+ _start_issued = _start_published |
|
1230 |
+ |
|
1231 |
+ def _end_published(self): |
|
1232 |
+ value = self.pop('published') |
|
1233 |
+ self._save('published_parsed', _parse_date(value)) |
|
1234 |
+ _end_dcterms_issued = _end_published |
|
1235 |
+ _end_issued = _end_published |
|
1236 |
+ |
|
1237 |
+ def _start_updated(self, attrsD): |
|
1238 |
+ self.push('updated', 1) |
|
1239 |
+ _start_modified = _start_updated |
|
1240 |
+ _start_dcterms_modified = _start_updated |
|
1241 |
+ _start_pubdate = _start_updated |
|
1242 |
+ _start_dc_date = _start_updated |
|
1243 |
+ |
|
1244 |
+ def _end_updated(self): |
|
1245 |
+ value = self.pop('updated') |
|
1246 |
+ parsed_value = _parse_date(value) |
|
1247 |
+ self._save('updated_parsed', parsed_value) |
|
1248 |
+ _end_modified = _end_updated |
|
1249 |
+ _end_dcterms_modified = _end_updated |
|
1250 |
+ _end_pubdate = _end_updated |
|
1251 |
+ _end_dc_date = _end_updated |
|
1252 |
+ |
|
1253 |
+ def _start_created(self, attrsD): |
|
1254 |
+ self.push('created', 1) |
|
1255 |
+ _start_dcterms_created = _start_created |
|
1256 |
+ |
|
1257 |
+ def _end_created(self): |
|
1258 |
+ value = self.pop('created') |
|
1259 |
+ self._save('created_parsed', _parse_date(value)) |
|
1260 |
+ _end_dcterms_created = _end_created |
|
1261 |
+ |
|
1262 |
+ def _start_expirationdate(self, attrsD): |
|
1263 |
+ self.push('expired', 1) |
|
1264 |
+ |
|
1265 |
+ def _end_expirationdate(self): |
|
1266 |
+ self._save('expired_parsed', _parse_date(self.pop('expired'))) |
|
1267 |
+ |
|
1268 |
+ def _start_cc_license(self, attrsD): |
|
1269 |
+ context = self._getContext() |
|
1270 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
1271 |
+ attrsD = FeedParserDict() |
|
1272 |
+ attrsD['rel']='license' |
|
1273 |
+ if value: attrsD['href']=value |
|
1274 |
+ context.setdefault('links', []).append(attrsD) |
|
1275 |
+ |
|
1276 |
+ def _start_creativecommons_license(self, attrsD): |
|
1277 |
+ self.push('license', 1) |
|
1278 |
+ _start_creativeCommons_license = _start_creativecommons_license |
|
1279 |
+ |
|
1280 |
+ def _end_creativecommons_license(self): |
|
1281 |
+ value = self.pop('license') |
|
1282 |
+ context = self._getContext() |
|
1283 |
+ attrsD = FeedParserDict() |
|
1284 |
+ attrsD['rel']='license' |
|
1285 |
+ if value: attrsD['href']=value |
|
1286 |
+ context.setdefault('links', []).append(attrsD) |
|
1287 |
+ del context['license'] |
|
1288 |
+ _end_creativeCommons_license = _end_creativecommons_license |
|
1289 |
+ |
|
1290 |
+ def _addXFN(self, relationships, href, name): |
|
1291 |
+ context = self._getContext() |
|
1292 |
+ xfn = context.setdefault('xfn', []) |
|
1293 |
+ value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) |
|
1294 |
+ if value not in xfn: |
|
1295 |
+ xfn.append(value) |
|
1296 |
+ |
|
1297 |
+ def _addTag(self, term, scheme, label): |
|
1298 |
+ context = self._getContext() |
|
1299 |
+ tags = context.setdefault('tags', []) |
|
1300 |
+ if (not term) and (not scheme) and (not label): return |
|
1301 |
+ value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) |
|
1302 |
+ if value not in tags: |
|
1303 |
+ tags.append(value) |
|
1304 |
+ |
|
1305 |
+ def _start_category(self, attrsD): |
|
1306 |
+ if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) |
|
1307 |
+ term = attrsD.get('term') |
|
1308 |
+ scheme = attrsD.get('scheme', attrsD.get('domain')) |
|
1309 |
+ label = attrsD.get('label') |
|
1310 |
+ self._addTag(term, scheme, label) |
|
1311 |
+ self.push('category', 1) |
|
1312 |
+ _start_dc_subject = _start_category |
|
1313 |
+ _start_keywords = _start_category |
|
1314 |
+ |
|
1315 |
+ def _end_itunes_keywords(self): |
|
1316 |
+ for term in self.pop('itunes_keywords').split(): |
|
1317 |
+ self._addTag(term, 'http://www.itunes.com/', None) |
|
1318 |
+ |
|
1319 |
+ def _start_itunes_category(self, attrsD): |
|
1320 |
+ self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) |
|
1321 |
+ self.push('category', 1) |
|
1322 |
+ |
|
1323 |
+ def _end_category(self): |
|
1324 |
+ value = self.pop('category') |
|
1325 |
+ if not value: return |
|
1326 |
+ context = self._getContext() |
|
1327 |
+ tags = context['tags'] |
|
1328 |
+ if value and len(tags) and not tags[-1]['term']: |
|
1329 |
+ tags[-1]['term'] = value |
|
1330 |
+ else: |
|
1331 |
+ self._addTag(value, None, None) |
|
1332 |
+ _end_dc_subject = _end_category |
|
1333 |
+ _end_keywords = _end_category |
|
1334 |
+ _end_itunes_category = _end_category |
|
1335 |
+ |
|
1336 |
+ def _start_cloud(self, attrsD): |
|
1337 |
+ self._getContext()['cloud'] = FeedParserDict(attrsD) |
|
1338 |
+ |
|
1339 |
+ def _start_link(self, attrsD): |
|
1340 |
+ attrsD.setdefault('rel', 'alternate') |
|
1341 |
+ if attrsD['rel'] == 'self': |
|
1342 |
+ attrsD.setdefault('type', 'application/atom+xml') |
|
1343 |
+ else: |
|
1344 |
+ attrsD.setdefault('type', 'text/html') |
|
1345 |
+ context = self._getContext() |
|
1346 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1347 |
+ if attrsD.has_key('href'): |
|
1348 |
+ attrsD['href'] = self.resolveURI(attrsD['href']) |
|
1349 |
+ if attrsD.get('rel')=='enclosure' and not context.get('id'): |
|
1350 |
+ context['id'] = attrsD.get('href') |
|
1351 |
+ expectingText = self.infeed or self.inentry or self.insource |
|
1352 |
+ context.setdefault('links', []) |
|
1353 |
+ context['links'].append(FeedParserDict(attrsD)) |
|
1354 |
+ if attrsD.has_key('href'): |
|
1355 |
+ expectingText = 0 |
|
1356 |
+ if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): |
|
1357 |
+ context['link'] = attrsD['href'] |
|
1358 |
+ else: |
|
1359 |
+ self.push('link', expectingText) |
|
1360 |
+ _start_producturl = _start_link |
|
1361 |
+ |
|
1362 |
+ def _end_link(self): |
|
1363 |
+ value = self.pop('link') |
|
1364 |
+ context = self._getContext() |
|
1365 |
+ _end_producturl = _end_link |
|
1366 |
+ |
|
1367 |
+ def _start_guid(self, attrsD): |
|
1368 |
+ self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') |
|
1369 |
+ self.push('id', 1) |
|
1370 |
+ |
|
1371 |
+ def _end_guid(self): |
|
1372 |
+ value = self.pop('id') |
|
1373 |
+ self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) |
|
1374 |
+ if self.guidislink: |
|
1375 |
+ # guid acts as link, but only if 'ispermalink' is not present or is 'true', |
|
1376 |
+ # and only if the item doesn't already have a link element |
|
1377 |
+ self._save('link', value) |
|
1378 |
+ |
|
1379 |
+ def _start_title(self, attrsD): |
|
1380 |
+ if self.svgOK: return self.unknown_starttag('title', attrsD.items()) |
|
1381 |
+ self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|
1382 |
+ _start_dc_title = _start_title |
|
1383 |
+ _start_media_title = _start_title |
|
1384 |
+ |
|
1385 |
+ def _end_title(self): |
|
1386 |
+ if self.svgOK: return |
|
1387 |
+ value = self.popContent('title') |
|
1388 |
+ if not value: return |
|
1389 |
+ context = self._getContext() |
|
1390 |
+ self.hasTitle = 1 |
|
1391 |
+ _end_dc_title = _end_title |
|
1392 |
+ |
|
1393 |
+ def _end_media_title(self): |
|
1394 |
+ hasTitle = self.hasTitle |
|
1395 |
+ self._end_title() |
|
1396 |
+ self.hasTitle = hasTitle |
|
1397 |
+ |
|
1398 |
+ def _start_description(self, attrsD): |
|
1399 |
+ context = self._getContext() |
|
1400 |
+ if context.has_key('summary'): |
|
1401 |
+ self._summaryKey = 'content' |
|
1402 |
+ self._start_content(attrsD) |
|
1403 |
+ else: |
|
1404 |
+ self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) |
|
1405 |
+ _start_dc_description = _start_description |
|
1406 |
+ |
|
1407 |
+ def _start_abstract(self, attrsD): |
|
1408 |
+ self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|
1409 |
+ |
|
1410 |
+ def _end_description(self): |
|
1411 |
+ if self._summaryKey == 'content': |
|
1412 |
+ self._end_content() |
|
1413 |
+ else: |
|
1414 |
+ value = self.popContent('description') |
|
1415 |
+ self._summaryKey = None |
|
1416 |
+ _end_abstract = _end_description |
|
1417 |
+ _end_dc_description = _end_description |
|
1418 |
+ |
|
1419 |
+ def _start_info(self, attrsD): |
|
1420 |
+ self.pushContent('info', attrsD, 'text/plain', 1) |
|
1421 |
+ _start_feedburner_browserfriendly = _start_info |
|
1422 |
+ |
|
1423 |
+ def _end_info(self): |
|
1424 |
+ self.popContent('info') |
|
1425 |
+ _end_feedburner_browserfriendly = _end_info |
|
1426 |
+ |
|
1427 |
+ def _start_generator(self, attrsD): |
|
1428 |
+ if attrsD: |
|
1429 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1430 |
+ if attrsD.has_key('href'): |
|
1431 |
+ attrsD['href'] = self.resolveURI(attrsD['href']) |
|
1432 |
+ self._getContext()['generator_detail'] = FeedParserDict(attrsD) |
|
1433 |
+ self.push('generator', 1) |
|
1434 |
+ |
|
1435 |
+ def _end_generator(self): |
|
1436 |
+ value = self.pop('generator') |
|
1437 |
+ context = self._getContext() |
|
1438 |
+ if context.has_key('generator_detail'): |
|
1439 |
+ context['generator_detail']['name'] = value |
|
1440 |
+ |
|
1441 |
+ def _start_admin_generatoragent(self, attrsD): |
|
1442 |
+ self.push('generator', 1) |
|
1443 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
1444 |
+ if value: |
|
1445 |
+ self.elementstack[-1][2].append(value) |
|
1446 |
+ self.pop('generator') |
|
1447 |
+ self._getContext()['generator_detail'] = FeedParserDict({'href': value}) |
|
1448 |
+ |
|
1449 |
+ def _start_admin_errorreportsto(self, attrsD): |
|
1450 |
+ self.push('errorreportsto', 1) |
|
1451 |
+ value = self._getAttribute(attrsD, 'rdf:resource') |
|
1452 |
+ if value: |
|
1453 |
+ self.elementstack[-1][2].append(value) |
|
1454 |
+ self.pop('errorreportsto') |
|
1455 |
+ |
|
1456 |
+ def _start_summary(self, attrsD): |
|
1457 |
+ context = self._getContext() |
|
1458 |
+ if context.has_key('summary'): |
|
1459 |
+ self._summaryKey = 'content' |
|
1460 |
+ self._start_content(attrsD) |
|
1461 |
+ else: |
|
1462 |
+ self._summaryKey = 'summary' |
|
1463 |
+ self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) |
|
1464 |
+ _start_itunes_summary = _start_summary |
|
1465 |
+ |
|
1466 |
+ def _end_summary(self): |
|
1467 |
+ if self._summaryKey == 'content': |
|
1468 |
+ self._end_content() |
|
1469 |
+ else: |
|
1470 |
+ self.popContent(self._summaryKey or 'summary') |
|
1471 |
+ self._summaryKey = None |
|
1472 |
+ _end_itunes_summary = _end_summary |
|
1473 |
+ |
|
1474 |
+ def _start_enclosure(self, attrsD): |
|
1475 |
+ attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1476 |
+ context = self._getContext() |
|
1477 |
+ attrsD['rel']='enclosure' |
|
1478 |
+ context.setdefault('links', []).append(FeedParserDict(attrsD)) |
|
1479 |
+ href = attrsD.get('href') |
|
1480 |
+ if href and not context.get('id'): |
|
1481 |
+ context['id'] = href |
|
1482 |
+ |
|
1483 |
+ def _start_source(self, attrsD): |
|
1484 |
+ self.insource = 1 |
|
1485 |
+ self.hasTitle = 0 |
|
1486 |
+ |
|
1487 |
+ def _end_source(self): |
|
1488 |
+ self.insource = 0 |
|
1489 |
+ self._getContext()['source'] = copy.deepcopy(self.sourcedata) |
|
1490 |
+ self.sourcedata.clear() |
|
1491 |
+ |
|
1492 |
+ def _start_content(self, attrsD): |
|
1493 |
+ self.pushContent('content', attrsD, 'text/plain', 1) |
|
1494 |
+ src = attrsD.get('src') |
|
1495 |
+ if src: |
|
1496 |
+ self.contentparams['src'] = src |
|
1497 |
+ self.push('content', 1) |
|
1498 |
+ |
|
1499 |
+ def _start_prodlink(self, attrsD): |
|
1500 |
+ self.pushContent('content', attrsD, 'text/html', 1) |
|
1501 |
+ |
|
1502 |
+ def _start_body(self, attrsD): |
|
1503 |
+ self.pushContent('content', attrsD, 'application/xhtml+xml', 1) |
|
1504 |
+ _start_xhtml_body = _start_body |
|
1505 |
+ |
|
1506 |
+ def _start_content_encoded(self, attrsD): |
|
1507 |
+ self.pushContent('content', attrsD, 'text/html', 1) |
|
1508 |
+ _start_fullitem = _start_content_encoded |
|
1509 |
+ |
|
1510 |
+ def _end_content(self): |
|
1511 |
+ copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) |
|
1512 |
+ value = self.popContent('content') |
|
1513 |
+ if copyToDescription: |
|
1514 |
+ self._save('description', value) |
|
1515 |
+ |
|
1516 |
+ _end_body = _end_content |
|
1517 |
+ _end_xhtml_body = _end_content |
|
1518 |
+ _end_content_encoded = _end_content |
|
1519 |
+ _end_fullitem = _end_content |
|
1520 |
+ _end_prodlink = _end_content |
|
1521 |
+ |
|
1522 |
+ def _start_itunes_image(self, attrsD): |
|
1523 |
+ self.push('itunes_image', 0) |
|
1524 |
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) |
|
1525 |
+ _start_itunes_link = _start_itunes_image |
|
1526 |
+ |
|
1527 |
+ def _end_itunes_block(self): |
|
1528 |
+ value = self.pop('itunes_block', 0) |
|
1529 |
+ self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 |
|
1530 |
+ |
|
1531 |
+ def _end_itunes_explicit(self): |
|
1532 |
+ value = self.pop('itunes_explicit', 0) |
|
1533 |
+ self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 |
|
1534 |
+ |
|
1535 |
+if _XML_AVAILABLE: |
|
1536 |
+ class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): |
|
1537 |
+ def __init__(self, baseuri, baselang, encoding): |
|
1538 |
+ if _debug: sys.stderr.write('trying StrictFeedParser\n') |
|
1539 |
+ xml.sax.handler.ContentHandler.__init__(self) |
|
1540 |
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
1541 |
+ self.bozo = 0 |
|
1542 |
+ self.exc = None |
|
1543 |
+ |
|
1544 |
+ def startPrefixMapping(self, prefix, uri): |
|
1545 |
+ self.trackNamespace(prefix, uri) |
|
1546 |
+ |
|
1547 |
+ def startElementNS(self, name, qname, attrs): |
|
1548 |
+ namespace, localname = name |
|
1549 |
+ lowernamespace = str(namespace or '').lower() |
|
1550 |
+ if lowernamespace.find('backend.userland.com/rss') <> -1: |
|
1551 |
+ # match any backend.userland.com namespace |
|
1552 |
+ namespace = 'http://backend.userland.com/rss' |
|
1553 |
+ lowernamespace = namespace |
|
1554 |
+ if qname and qname.find(':') > 0: |
|
1555 |
+ givenprefix = qname.split(':')[0] |
|
1556 |
+ else: |
|
1557 |
+ givenprefix = None |
|
1558 |
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
1559 |
+ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): |
|
1560 |
+ raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix |
|
1561 |
+ localname = str(localname).lower() |
|
1562 |
+ |
|
1563 |
+ # qname implementation is horribly broken in Python 2.1 (it |
|
1564 |
+ # doesn't report any), and slightly broken in Python 2.2 (it |
|
1565 |
+ # doesn't report the xml: namespace). So we match up namespaces |
|
1566 |
+ # with a known list first, and then possibly override them with |
|
1567 |
+ # the qnames the SAX parser gives us (if indeed it gives us any |
|
1568 |
+ # at all). Thanks to MatejC for helping me test this and |
|
1569 |
+ # tirelessly telling me that it didn't work yet. |
|
1570 |
+ attrsD = {} |
|
1571 |
+ if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': |
|
1572 |
+ attrsD['xmlns']=namespace |
|
1573 |
+ if localname=='svg' and namespace=='http://www.w3.org/2000/svg': |
|
1574 |
+ attrsD['xmlns']=namespace |
|
1575 |
+ |
|
1576 |
+ if prefix: |
|
1577 |
+ localname = prefix.lower() + ':' + localname |
|
1578 |
+ elif namespace and not qname: #Expat |
|
1579 |
+ for name,value in self.namespacesInUse.items(): |
|
1580 |
+ if name and value == namespace: |
|
1581 |
+ localname = name + ':' + localname |
|
1582 |
+ break |
|
1583 |
+ if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) |
|
1584 |
+ |
|
1585 |
+ for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): |
|
1586 |
+ lowernamespace = (namespace or '').lower() |
|
1587 |
+ prefix = self._matchnamespaces.get(lowernamespace, '') |
|
1588 |
+ if prefix: |
|
1589 |
+ attrlocalname = prefix + ':' + attrlocalname |
|
1590 |
+ attrsD[str(attrlocalname).lower()] = attrvalue |
|
1591 |
+ for qname in attrs.getQNames(): |
|
1592 |
+ attrsD[str(qname).lower()] = attrs.getValueByQName(qname) |
|
1593 |
+ self.unknown_starttag(localname, attrsD.items()) |
|
1594 |
+ |
|
1595 |
+ def characters(self, text): |
|
1596 |
+ self.handle_data(text) |
|
1597 |
+ |
|
1598 |
+ def endElementNS(self, name, qname): |
|
1599 |
+ namespace, localname = name |
|
1600 |
+ lowernamespace = str(namespace or '').lower() |
|
1601 |
+ if qname and qname.find(':') > 0: |
|
1602 |
+ givenprefix = qname.split(':')[0] |
|
1603 |
+ else: |
|
1604 |
+ givenprefix = '' |
|
1605 |
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
1606 |
+ if prefix: |
|
1607 |
+ localname = prefix + ':' + localname |
|
1608 |
+ elif namespace and not qname: #Expat |
|
1609 |
+ for name,value in self.namespacesInUse.items(): |
|
1610 |
+ if name and value == namespace: |
|
1611 |
+ localname = name + ':' + localname |
|
1612 |
+ break |
|
1613 |
+ localname = str(localname).lower() |
|
1614 |
+ self.unknown_endtag(localname) |
|
1615 |
+ |
|
1616 |
+ def error(self, exc): |
|
1617 |
+ self.bozo = 1 |
|
1618 |
+ self.exc = exc |
|
1619 |
+ |
|
1620 |
+ def fatalError(self, exc): |
|
1621 |
+ self.error(exc) |
|
1622 |
+ raise exc |
|
1623 |
+ |
|
1624 |
+class _BaseHTMLProcessor(sgmllib.SGMLParser): |
|
1625 |
+ special = re.compile('''[<>'"]''') |
|
1626 |
+ bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") |
|
1627 |
+ elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', |
|
1628 |
+ 'img', 'input', 'isindex', 'link', 'meta', 'param'] |
|
1629 |
+ |
|
1630 |
+ def __init__(self, encoding, type): |
|
1631 |
+ self.encoding = encoding |
|
1632 |
+ self.type = type |
|
1633 |
+ if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) |
|
1634 |
+ sgmllib.SGMLParser.__init__(self) |
|
1635 |
+ |
|
1636 |
+ def reset(self): |
|
1637 |
+ self.pieces = [] |
|
1638 |
+ sgmllib.SGMLParser.reset(self) |
|
1639 |
+ |
|
1640 |
+ def _shorttag_replace(self, match): |
|
1641 |
+ tag = match.group(1) |
|
1642 |
+ if tag in self.elements_no_end_tag: |
|
1643 |
+ return '<' + tag + ' />' |
|
1644 |
+ else: |
|
1645 |
+ return '<' + tag + '></' + tag + '>' |
|
1646 |
+ |
|
1647 |
+ def parse_starttag(self,i): |
|
1648 |
+ j=sgmllib.SGMLParser.parse_starttag(self, i) |
|
1649 |
+ if self.type == 'application/xhtml+xml': |
|
1650 |
+ if j>2 and self.rawdata[j-2:j]=='/>': |
|
1651 |
+ self.unknown_endtag(self.lasttag) |
|
1652 |
+ return j |
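+ # note: in XHTML content a tag written as <tag/> gets its end tag emitted here as well, |
+ # so the reconstructed output collected in self.pieces stays balanced |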
|
1653 |
+ |
|
1654 |
+ def feed(self, data): |
|
1655 |
+ data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data) |
|
1656 |
+ #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace |
|
1657 |
+ data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) |
|
1658 |
+ data = data.replace('&#39;', "'") |
|
1659 |
+ data = data.replace('&#34;', '"') |
|
1660 |
+ if self.encoding and type(data) == type(u''): |
|
1661 |
+ data = data.encode(self.encoding) |
|
1662 |
+ sgmllib.SGMLParser.feed(self, data) |
|
1663 |
+ sgmllib.SGMLParser.close(self) |
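+ # note: feed() pre-processes the markup before sgmllib parses it: bare "<!" declarations are |
+ # escaped, self-closing tags are expanded via _shorttag_replace, and &#39;/&#34; are decoded |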
|
1664 |
+ |
|
1665 |
+ def normalize_attrs(self, attrs): |
|
1666 |
+ if not attrs: return attrs |
|
1667 |
+ # utility method to be called by descendants |
|
1668 |
+ attrs = dict([(k.lower(), v) for k, v in attrs]).items() |
|
1669 |
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] |
|
1670 |
+ attrs.sort() |
|
1671 |
+ return attrs |
|
1672 |
+ |
|
1673 |
+ def unknown_starttag(self, tag, attrs): |
|
1674 |
+ # called for each start tag |
|
1675 |
+ # attrs is a list of (attr, value) tuples |
|
1676 |
+ # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')] |
|
1677 |
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag) |
|
1678 |
+ uattrs = [] |
|
1679 |
+ strattrs='' |
|
1680 |
+ if attrs: |
|
1681 |
+ for key, value in attrs: |
|
1682 |
+ value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') |
|
1683 |
+ value = self.bare_ampersand.sub("&amp;", value) |
|
1684 |
+ # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds |
|
1685 |
+ if type(value) != type(u''): |
|
1686 |
+ try: |
|
1687 |
+ value = unicode(value, self.encoding) |
|
1688 |
+ except: |
|
1689 |
+ value = unicode(value, 'iso-8859-1') |
|
1690 |
+ uattrs.append((unicode(key, self.encoding), value)) |
|
1691 |
+ strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]) |
|
1692 |
+ if self.encoding: |
|
1693 |
+ try: |
|
1694 |
+ strattrs=strattrs.encode(self.encoding) |
|
1695 |
+ except: |
|
1696 |
+ pass |
|
1697 |
+ if tag in self.elements_no_end_tag: |
|
1698 |
+ self.pieces.append('<%(tag)s%(strattrs)s />' % locals()) |
|
1699 |
+ else: |
|
1700 |
+ self.pieces.append('<%(tag)s%(strattrs)s>' % locals()) |
|
1701 |
+ |
|
1702 |
+ def unknown_endtag(self, tag): |
|
1703 |
+ # called for each end tag, e.g. for </pre>, tag will be 'pre' |
|
1704 |
+ # Reconstruct the original end tag. |
|
1705 |
+ if tag not in self.elements_no_end_tag: |
|
1706 |
+ self.pieces.append("</%(tag)s>" % locals()) |
|
1707 |
+ |
|
1708 |
+ def handle_charref(self, ref): |
|
1709 |
+ # called for each character reference, e.g. for '&#160;', ref will be '160' |
|
1710 |
+ # Reconstruct the original character reference. |
|
1711 |
+ if ref.startswith('x'): |
|
1712 |
+ value = unichr(int(ref[1:],16)) |
|
1713 |
+ else: |
|
1714 |
+ value = unichr(int(ref)) |
|
1715 |
+ |
|
1716 |
+ if value in _cp1252.keys(): |
|
1717 |
+ self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) |
|
1718 |
+ else: |
|
1719 |
+ self.pieces.append('&#%(ref)s;' % locals()) |
|
1720 |
+ |
|
1721 |
+ def handle_entityref(self, ref): |
|
1722 |
+ # called for each entity reference, e.g. for '&copy;', ref will be 'copy' |
|
1723 |
+ # Reconstruct the original entity reference. |
|
1724 |
+ if name2codepoint.has_key(ref): |
|
1725 |
+ self.pieces.append('&%(ref)s;' % locals()) |
|
1726 |
+ else: |
|
1727 |
+ self.pieces.append('&amp;%(ref)s' % locals()) |
|
1728 |
+ |
|
1729 |
+ def handle_data(self, text): |
|
1730 |
+ # called for each block of plain text, i.e. outside of any tag and |
|
1731 |
+ # not containing any character or entity references |
|
1732 |
+ # Store the original text verbatim. |
|
1733 |
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) |
|
1734 |
+ self.pieces.append(text) |
|
1735 |
+ |
|
1736 |
+ def handle_comment(self, text): |
|
1737 |
+ # called for each HTML comment, e.g. <!-- insert Javascript code here --> |
|
1738 |
+ # Reconstruct the original comment. |
|
1739 |
+ self.pieces.append('<!--%(text)s-->' % locals()) |
|
1740 |
+ |
|
1741 |
+ def handle_pi(self, text): |
|
1742 |
+ # called for each processing instruction, e.g. <?instruction> |
|
1743 |
+ # Reconstruct original processing instruction. |
|
1744 |
+ self.pieces.append('<?%(text)s>' % locals()) |
|
1745 |
+ |
|
1746 |
+ def handle_decl(self, text): |
|
1747 |
+ # called for the DOCTYPE, if present, e.g. |
|
1748 |
+ # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" |
|
1749 |
+ # "http://www.w3.org/TR/html4/loose.dtd"> |
|
1750 |
+ # Reconstruct original DOCTYPE |
|
1751 |
+ self.pieces.append('<!%(text)s>' % locals()) |
|
1752 |
+ |
|
1753 |
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match |
|
1754 |
+ def _scan_name(self, i, declstartpos): |
|
1755 |
+ rawdata = self.rawdata |
|
1756 |
+ n = len(rawdata) |
|
1757 |
+ if i == n: |
|
1758 |
+ return None, -1 |
|
1759 |
+ m = self._new_declname_match(rawdata, i) |
|
1760 |
+ if m: |
|
1761 |
+ s = m.group() |
|
1762 |
+ name = s.strip() |
|
1763 |
+ if (i + len(s)) == n: |
|
1764 |
+ return None, -1 # end of buffer |
|
1765 |
+ return name.lower(), m.end() |
|
1766 |
+ else: |
|
1767 |
+ self.handle_data(rawdata) |
|
1768 |
+# self.updatepos(declstartpos, i) |
|
1769 |
+ return None, -1 |
|
1770 |
+ |
|
1771 |
+ def convert_charref(self, name): |
|
1772 |
+ return '&#%s;' % name |
|
1773 |
+ |
|
1774 |
+ def convert_entityref(self, name): |
|
1775 |
+ return '&%s;' % name |
|
1776 |
+ |
|
1777 |
+ def output(self): |
|
1778 |
+ '''Return processed HTML as a single string''' |
|
1779 |
+ return ''.join([str(p) for p in self.pieces]) |
|
1780 |
+ |
|
1781 |
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): |
|
1782 |
+ def __init__(self, baseuri, baselang, encoding, entities): |
|
1783 |
+ sgmllib.SGMLParser.__init__(self) |
|
1784 |
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
1785 |
+ _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml') |
|
1786 |
+ self.entities=entities |
|
1787 |
+ |
|
1788 |
+ def decodeEntities(self, element, data): |
|
1789 |
+ data = data.replace('&#60;', '&lt;') |
|
1790 |
+ data = data.replace('&#x3c;', '&lt;') |
|
1791 |
+ data = data.replace('&#x3C;', '&lt;') |
|
1792 |
+ data = data.replace('&#62;', '&gt;') |
|
1793 |
+ data = data.replace('&#x3e;', '&gt;') |
|
1794 |
+ data = data.replace('&#x3E;', '&gt;') |
|
1795 |
+ data = data.replace('&#38;', '&amp;') |
|
1796 |
+ data = data.replace('&#x26;', '&amp;') |
|
1797 |
+ data = data.replace('&#34;', '&quot;') |
|
1798 |
+ data = data.replace('&#x22;', '&quot;') |
|
1799 |
+ data = data.replace('&#39;', '&apos;') |
|
1800 |
+ data = data.replace('&#x27;', '&apos;') |
|
1801 |
+ if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): |
|
1802 |
+ data = data.replace('&lt;', '<') |
|
1803 |
+ data = data.replace('&gt;', '>') |
|
1804 |
+ data = data.replace('&amp;', '&') |
|
1805 |
+ data = data.replace('&quot;', '"') |
|
1806 |
+ data = data.replace('&apos;', "'") |
|
1807 |
+ return data |
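+ # note: numeric character references are normalized to the named entities above; when the |
+ # content type is not XML, the named entities are then decoded to literal characters |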
|
1808 |
+ |
|
1809 |
+ def strattrs(self, attrs): |
|
1810 |
+ return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs]) |
|
1811 |
+ |
|
1812 |
+class _MicroformatsParser: |
|
1813 |
+ STRING = 1 |
|
1814 |
+ DATE = 2 |
|
1815 |
+ URI = 3 |
|
1816 |
+ NODE = 4 |
|
1817 |
+ EMAIL = 5 |
|
1818 |
+ |
|
1819 |
+ known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'] |
|
1820 |
+ known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'] |
|
1821 |
+ |
|
1822 |
+ def __init__(self, data, baseuri, encoding): |
|
1823 |
+ self.document = BeautifulSoup.BeautifulSoup(data) |
|
1824 |
+ self.baseuri = baseuri |
|
1825 |
+ self.encoding = encoding |
|
1826 |
+ if type(data) == type(u''): |
|
1827 |
+ data = data.encode(encoding) |
|
1828 |
+ self.tags = [] |
|
1829 |
+ self.enclosures = [] |
|
1830 |
+ self.xfn = [] |
|
1831 |
+ self.vcard = None |
|
1832 |
+ |
|
1833 |
+ def vcardEscape(self, s): |
|
1834 |
+ if type(s) in (type(''), type(u'')): |
|
1835 |
+ s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') |
|
1836 |
+ return s |
|
1837 |
+ |
|
1838 |
+ def vcardFold(self, s): |
|
1839 |
+ s = re.sub(';+$', '', s) |
|
1840 |
+ sFolded = '' |
|
1841 |
+ iMax = 75 |
|
1842 |
+ sPrefix = '' |
|
1843 |
+ while len(s) > iMax: |
|
1844 |
+ sFolded += sPrefix + s[:iMax] + '\n' |
|
1845 |
+ s = s[iMax:] |
|
1846 |
+ sPrefix = ' ' |
|
1847 |
+ iMax = 74 |
|
1848 |
+ sFolded += sPrefix + s |
|
1849 |
+ return sFolded |
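+ # note: folds long vCard property lines as RFC 2426 describes: 75 characters on the first |
+ # line, then 74-character continuation lines prefixed with a single space |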
|
1850 |
+ |
|
1851 |
+ def normalize(self, s): |
|
1852 |
+ return re.sub(r'\s+', ' ', s).strip() |
|
1853 |
+ |
|
1854 |
+ def unique(self, aList): |
|
1855 |
+ results = [] |
|
1856 |
+ for element in aList: |
|
1857 |
+ if element not in results: |
|
1858 |
+ results.append(element) |
|
1859 |
+ return results |
|
1860 |
+ |
|
1861 |
+ def toISO8601(self, dt): |
|
1862 |
+ return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) |
|
1863 |
+ |
|
1864 |
+ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0): |
|
1865 |
+ all = lambda x: 1 |
|
1866 |
+ sProperty = sProperty.lower() |
|
1867 |
+ bFound = 0 |
|
1868 |
+ bNormalize = 1 |
|
1869 |
+ propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)} |
|
1870 |
+ if bAllowMultiple and (iPropertyType != self.NODE): |
|
1871 |
+ snapResults = [] |
|
1872 |
+ containers = elmRoot(['ul', 'ol'], propertyMatch) |
|
1873 |
+ for container in containers: |
|
1874 |
+ snapResults.extend(container('li')) |
|
1875 |
+ bFound = (len(snapResults) != 0) |
|
1876 |
+ if not bFound: |
|
1877 |
+ snapResults = elmRoot(all, propertyMatch) |
|
1878 |
+ bFound = (len(snapResults) != 0) |
|
1879 |
+ if (not bFound) and (sProperty == 'value'): |
|
1880 |
+ snapResults = elmRoot('pre') |
|
1881 |
+ bFound = (len(snapResults) != 0) |
|
1882 |
+ bNormalize = not bFound |
|
1883 |
+ if not bFound: |
|
1884 |
+ snapResults = [elmRoot] |
|
1885 |
+ bFound = (len(snapResults) != 0) |
|
1886 |
+ arFilter = [] |
|
1887 |
+ if sProperty == 'vcard': |
|
1888 |
+ snapFilter = elmRoot(all, propertyMatch) |
|
1889 |
+ for node in snapFilter: |
|
1890 |
+ if node.findParent(all, propertyMatch): |
|
1891 |
+ arFilter.append(node) |
|
1892 |
+ arResults = [] |
|
1893 |
+ for node in snapResults: |
|
1894 |
+ if node not in arFilter: |
|
1895 |
+ arResults.append(node) |
|
1896 |
+ bFound = (len(arResults) != 0) |
|
1897 |
+ if not bFound: |
|
1898 |
+ if bAllowMultiple: return [] |
|
1899 |
+ elif iPropertyType == self.STRING: return '' |
|
1900 |
+ elif iPropertyType == self.DATE: return None |
|
1901 |
+ elif iPropertyType == self.URI: return '' |
|
1902 |
+ elif iPropertyType == self.NODE: return None |
|
1903 |
+ else: return None |
|
1904 |
+ arValues = [] |
|
1905 |
+ for elmResult in arResults: |
|
1906 |
+ sValue = None |
|
1907 |
+ if iPropertyType == self.NODE: |
|
1908 |
+ if bAllowMultiple: |
|
1909 |
+ arValues.append(elmResult) |
|
1910 |
+ continue |
|
1911 |
+ else: |
|
1912 |
+ return elmResult |
|
1913 |
+ sNodeName = elmResult.name.lower() |
|
1914 |
+ if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): |
|
1915 |
+ sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0] |
|
1916 |
+ if sValue: |
|
1917 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1918 |
+ if (not sValue) and (sNodeName == 'abbr'): |
|
1919 |
+ sValue = elmResult.get('title') |
|
1920 |
+ if sValue: |
|
1921 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1922 |
+ if (not sValue) and (iPropertyType == self.URI): |
|
1923 |
+ if sNodeName == 'a': sValue = elmResult.get('href') |
|
1924 |
+ elif sNodeName == 'img': sValue = elmResult.get('src') |
|
1925 |
+ elif sNodeName == 'object': sValue = elmResult.get('data') |
|
1926 |
+ if sValue: |
|
1927 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1928 |
+ if (not sValue) and (sNodeName == 'img'): |
|
1929 |
+ sValue = elmResult.get('alt') |
|
1930 |
+ if sValue: |
|
1931 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1932 |
+ if not sValue: |
|
1933 |
+ sValue = elmResult.renderContents() |
|
1934 |
+ sValue = re.sub(r'<\S[^>]*>', '', sValue) |
|
1935 |
+ sValue = sValue.replace('\r\n', '\n') |
|
1936 |
+ sValue = sValue.replace('\r', '\n') |
|
1937 |
+ if sValue: |
|
1938 |
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1939 |
+ if not sValue: continue |
|
1940 |
+ if iPropertyType == self.DATE: |
|
1941 |
+ sValue = _parse_date_iso8601(sValue) |
|
1942 |
+ if bAllowMultiple: |
|
1943 |
+ arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) |
|
1944 |
+ else: |
|
1945 |
+ return bAutoEscape and self.vcardEscape(sValue) or sValue |
|
1946 |
+ return arValues |
|
1947 |
+ |
|
1948 |
+ def findVCards(self, elmRoot, bAgentParsing=0): |
|
1949 |
+ sVCards = '' |
|
1950 |
+ |
|
1951 |
+ if not bAgentParsing: |
|
1952 |
+ arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) |
|
1953 |
+ else: |
|
1954 |
+ arCards = [elmRoot] |
|
1955 |
+ |
|
1956 |
+ for elmCard in arCards: |
|
1957 |
+ arLines = [] |
|
1958 |
+ |
|
1959 |
+ def processSingleString(sProperty): |
|
1960 |
+ sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1) |
|
1961 |
+ if sValue: |
|
1962 |
+ arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) |
|
1963 |
+ return sValue or '' |
|
1964 |
+ |
|
1965 |
+ def processSingleURI(sProperty): |
|
1966 |
+ sValue = self.getPropertyValue(elmCard, sProperty, self.URI) |
|
1967 |
+ if sValue: |
|
1968 |
+ sContentType = '' |
|
1969 |
+ sEncoding = '' |
|
1970 |
+ sValueKey = '' |
|
1971 |
+ if sValue.startswith('data:'): |
|
1972 |
+ sEncoding = ';ENCODING=b' |
|
1973 |
+ sContentType = sValue.split(';')[0].split('/').pop() |
|
1974 |
+ sValue = sValue.split(',', 1).pop() |
|
1975 |
+ else: |
|
1976 |
+ elmValue = self.getPropertyValue(elmCard, sProperty) |
|
1977 |
+ if elmValue: |
|
1978 |
+ if sProperty != 'url': |
|
1979 |
+ sValueKey = ';VALUE=uri' |
|
1980 |
+ sContentType = elmValue.get('type', '').strip().split('/').pop().strip() |
|
1981 |
+ sContentType = sContentType.upper() |
|
1982 |
+ if sContentType == 'OCTET-STREAM': |
|
1983 |
+ sContentType = '' |
|
1984 |
+ if sContentType: |
|
1985 |
+ sContentType = ';TYPE=' + sContentType.upper() |
|
1986 |
+ arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) |
|
1987 |
+ |
|
1988 |
+ def processTypeValue(sProperty, arDefaultType, arForceType=None): |
|
1989 |
+ arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) |
|
1990 |
+ for elmResult in arResults: |
|
1991 |
+ arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) |
|
1992 |
+ if arForceType: |
|
1993 |
+ arType = self.unique(arForceType + arType) |
|
1994 |
+ if not arType: |
|
1995 |
+ arType = arDefaultType |
|
1996 |
+ sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) |
|
1997 |
+ if sValue: |
|
1998 |
+ arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) |
|
1999 |
+ |
|
2000 |
+ # AGENT |
|
2001 |
+ # must do this before all other properties because it is destructive |
|
2002 |
+ # (removes nested class="vcard" nodes so they don't interfere with |
|
2003 |
+ # this vcard's other properties) |
|
2004 |
+ arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) |
|
2005 |
+ for elmAgent in arAgent: |
|
2006 |
+ if re.compile(r'\bvcard\b').search(elmAgent.get('class')): |
|
2007 |
+ sAgentValue = self.findVCards(elmAgent, 1) + '\n' |
|
2008 |
+ sAgentValue = sAgentValue.replace('\n', '\\n') |
|
2009 |
+ sAgentValue = sAgentValue.replace(';', '\\;') |
|
2010 |
+ if sAgentValue: |
|
2011 |
+ arLines.append(self.vcardFold('AGENT:' + sAgentValue)) |
|
2012 |
+ elmAgent['class'] = '' |
|
2013 |
+ elmAgent.contents = [] |
|
2014 |
+ else: |
|
2015 |
+ sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); |
|
2016 |
+ if sAgentValue: |
|
2017 |
+ arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) |
|
2018 |
+ |
|
2019 |
+ # FN (full name) |
|
2020 |
+ sFN = processSingleString('fn') |
|
2021 |
+ |
|
2022 |
+ # N (name) |
|
2023 |
+ elmName = self.getPropertyValue(elmCard, 'n') |
|
2024 |
+ if elmName: |
|
2025 |
+ sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) |
|
2026 |
+ sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) |
|
2027 |
+ arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) |
|
2028 |
+ arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) |
|
2029 |
+ arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) |
|
2030 |
+ arLines.append(self.vcardFold('N:' + sFamilyName + ';' + |
|
2031 |
+ sGivenName + ';' + |
|
2032 |
+ ','.join(arAdditionalNames) + ';' + |
|
2033 |
+ ','.join(arHonorificPrefixes) + ';' + |
|
2034 |
+ ','.join(arHonorificSuffixes))) |
|
2035 |
+ elif sFN: |
|
2036 |
+ # implied "N" optimization |
|
2037 |
+ # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization |
|
2038 |
+ arNames = self.normalize(sFN).split() |
|
2039 |
+ if len(arNames) == 2: |
|
2040 |
+ bFamilyNameFirst = (arNames[0].endswith(',') or |
|
2041 |
+ len(arNames[1]) == 1 or |
|
2042 |
+ ((len(arNames[1]) == 2) and (arNames[1].endswith('.')))) |
|
2043 |
+ if bFamilyNameFirst: |
|
2044 |
+ arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) |
|
2045 |
+ else: |
|
2046 |
+ arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) |
|
2047 |
+ |
|
2048 |
+ # SORT-STRING |
|
2049 |
+ sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) |
|
2050 |
+ if sSortString: |
|
2051 |
+ arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) |
|
2052 |
+ |
|
2053 |
+ # NICKNAME |
|
2054 |
+ arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) |
|
2055 |
+ if arNickname: |
|
2056 |
+ arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) |
|
2057 |
+ |
|
2058 |
+ # PHOTO |
|
2059 |
+ processSingleURI('photo') |
|
2060 |
+ |
|
2061 |
+ # BDAY |
|
2062 |
+ dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) |
|
2063 |
+ if dtBday: |
|
2064 |
+ arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) |
|
2065 |
+ |
|
2066 |
+ # ADR (address) |
|
2067 |
+ arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) |
|
2068 |
+ for elmAdr in arAdr: |
|
2069 |
+ arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) |
|
2070 |
+ if not arType: |
|
2071 |
+ arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 |
|
2072 |
+ sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) |
|
2073 |
+ sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) |
|
2074 |
+ sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) |
|
2075 |
+ sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) |
|
2076 |
+ sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) |
|
2077 |
+ sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1) |
|
2078 |
+ sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) |
|
2079 |
+ arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' + |
|
2080 |
+ sPostOfficeBox + ';' + |
|
2081 |
+ sExtendedAddress + ';' + |
|
2082 |
+ sStreetAddress + ';' + |
|
2083 |
+ sLocality + ';' + |
|
2084 |
+ sRegion + ';' + |
|
2085 |
+ sPostalCode + ';' + |
|
2086 |
+ sCountryName)) |
|
2087 |
+ |
|
2088 |
+ # LABEL |
|
2089 |
+ processTypeValue('label', ['intl','postal','parcel','work']) |
|
2090 |
+ |
|
2091 |
+ # TEL (phone number) |
|
2092 |
+ processTypeValue('tel', ['voice']) |
|
2093 |
+ |
|
2094 |
+ # EMAIL |
|
2095 |
+ processTypeValue('email', ['internet'], ['internet']) |
|
2096 |
+ |
|
2097 |
+ # MAILER |
|
2098 |
+ processSingleString('mailer') |
|
2099 |
+ |
|
2100 |
+ # TZ (timezone) |
|
2101 |
+ processSingleString('tz') |
|
2102 |
+ |
|
2103 |
+ # GEO (geographical information) |
|
2104 |
+ elmGeo = self.getPropertyValue(elmCard, 'geo') |
|
2105 |
+ if elmGeo: |
|
2106 |
+ sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) |
|
2107 |
+ sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) |
|
2108 |
+ arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) |
|
2109 |
+ |
|
2110 |
+ # TITLE |
|
2111 |
+ processSingleString('title') |
|
2112 |
+ |
|
2113 |
+ # ROLE |
|
2114 |
+ processSingleString('role') |
|
2115 |
+ |
|
2116 |
+ # LOGO |
|
2117 |
+ processSingleURI('logo') |
|
2118 |
+ |
|
2119 |
+ # ORG (organization) |
|
2120 |
+ elmOrg = self.getPropertyValue(elmCard, 'org') |
|
2121 |
+ if elmOrg: |
|
2122 |
+ sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) |
|
2123 |
+ if not sOrganizationName: |
|
2124 |
+ # implied "organization-name" optimization |
|
2125 |
+ # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization |
|
2126 |
+ sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) |
|
2127 |
+ if sOrganizationName: |
|
2128 |
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName)) |
|
2129 |
+ else: |
|
2130 |
+ arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) |
|
2131 |
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) |
|
2132 |
+ |
|
2133 |
+ # CATEGORY |
|
2134 |
+ arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) |
|
2135 |
+ if arCategory: |
|
2136 |
+ arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) |
|
2137 |
+ |
|
2138 |
+ # NOTE |
|
2139 |
+ processSingleString('note') |
|
2140 |
+ |
|
2141 |
+ # REV |
|
2142 |
+ processSingleString('rev') |
|
2143 |
+ |
|
2144 |
+ # SOUND |
|
2145 |
+ processSingleURI('sound') |
|
2146 |
+ |
|
2147 |
+ # UID |
|
2148 |
+ processSingleString('uid') |
|
2149 |
+ |
|
2150 |
+ # URL |
|
2151 |
+ processSingleURI('url') |
|
2152 |
+ |
|
2153 |
+ # CLASS |
|
2154 |
+ processSingleString('class') |
|
2155 |
+ |
|
2156 |
+ # KEY |
|
2157 |
+ processSingleURI('key') |
|
2158 |
+ |
|
2159 |
+ if arLines: |
|
2160 |
+ arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard'] |
|
2161 |
+ sVCards += '\n'.join(arLines) + '\n' |
|
2162 |
+ |
|
2163 |
+ return sVCards.strip() |
|
2164 |
+ |
|
2165 |
+ def isProbablyDownloadable(self, elm): |
|
2166 |
+ attrsD = elm.attrMap |
|
2167 |
+ if not attrsD.has_key('href'): return 0 |
|
2168 |
+ linktype = attrsD.get('type', '').strip() |
|
2169 |
+ if linktype.startswith('audio/') or \ |
|
2170 |
+ linktype.startswith('video/') or \ |
|
2171 |
+ (linktype.startswith('application/') and not linktype.endswith('xml')): |
|
2172 |
+ return 1 |
|
2173 |
+ path = urlparse.urlparse(attrsD['href'])[2] |
|
2174 |
+ if path.find('.') == -1: return 0 |
|
2175 |
+ fileext = path.split('.').pop().lower() |
|
2176 |
+ return fileext in self.known_binary_extensions |
|
2177 |
+ |
|
2178 |
+ def findTags(self): |
|
2179 |
+ all = lambda x: 1 |
|
2180 |
+ for elm in self.document(all, {'rel': re.compile(r'\btag\b')}): |
|
2181 |
+ href = elm.get('href') |
|
2182 |
+ if not href: continue |
|
2183 |
+ urlscheme, domain, path, params, query, fragment = \ |
|
2184 |
+ urlparse.urlparse(_urljoin(self.baseuri, href)) |
|
2185 |
+ segments = path.split('/') |
|
2186 |
+ tag = segments.pop() |
|
2187 |
+ if not tag: |
|
2188 |
+ tag = segments.pop() |
|
2189 |
+ tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', '')) |
|
2190 |
+ if not tagscheme.endswith('/'): |
|
2191 |
+ tagscheme += '/' |
|
2192 |
+ self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''})) |
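+ # note: per the rel-tag microformat, the last non-empty path segment is the tag term and the |
+ # rest of the URL (normalized to end in "/") is used as the scheme |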
|
2193 |
+ |
|
2194 |
+ def findEnclosures(self): |
|
2195 |
+ all = lambda x: 1 |
|
2196 |
+ enclosure_match = re.compile(r'\benclosure\b') |
|
2197 |
+ for elm in self.document(all, {'href': re.compile(r'.+')}): |
|
2198 |
+ if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue |
|
2199 |
+ if elm.attrMap not in self.enclosures: |
|
2200 |
+ self.enclosures.append(elm.attrMap) |
|
2201 |
+ if elm.string and not elm.get('title'): |
|
2202 |
+ self.enclosures[-1]['title'] = elm.string |
|
2203 |
+ |
|
2204 |
+ def findXFN(self): |
|
2205 |
+ all = lambda x: 1 |
|
2206 |
+ for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}): |
|
2207 |
+ rels = elm.get('rel', '').split() |
|
2208 |
+ xfn_rels = [] |
|
2209 |
+ for rel in rels: |
|
2210 |
+ if rel in self.known_xfn_relationships: |
|
2211 |
+ xfn_rels.append(rel) |
|
2212 |
+ if xfn_rels: |
|
2213 |
+ self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string}) |
|
2214 |
+ |
|
2215 |
+def _parseMicroformats(htmlSource, baseURI, encoding): |
|
2216 |
+ if not BeautifulSoup: return |
|
2217 |
+ if _debug: sys.stderr.write('entering _parseMicroformats\n') |
|
2218 |
+ p = _MicroformatsParser(htmlSource, baseURI, encoding) |
|
2219 |
+ p.vcard = p.findVCards(p.document) |
|
2220 |
+ p.findTags() |
|
2221 |
+ p.findEnclosures() |
|
2222 |
+ p.findXFN() |
|
2223 |
+ return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard} |
|
2224 |
+ |
|
2225 |
+class _RelativeURIResolver(_BaseHTMLProcessor): |
|
2226 |
+ relative_uris = [('a', 'href'), |
|
2227 |
+ ('applet', 'codebase'), |
|
2228 |
+ ('area', 'href'), |
|
2229 |
+ ('blockquote', 'cite'), |
|
2230 |
+ ('body', 'background'), |
|
2231 |
+ ('del', 'cite'), |
|
2232 |
+ ('form', 'action'), |
|
2233 |
+ ('frame', 'longdesc'), |
|
2234 |
+ ('frame', 'src'), |
|
2235 |
+ ('iframe', 'longdesc'), |
|
2236 |
+ ('iframe', 'src'), |
|
2237 |
+ ('head', 'profile'), |
|
2238 |
+ ('img', 'longdesc'), |
|
2239 |
+ ('img', 'src'), |
|
2240 |
+ ('img', 'usemap'), |
|
2241 |
+ ('input', 'src'), |
|
2242 |
+ ('input', 'usemap'), |
|
2243 |
+ ('ins', 'cite'), |
|
2244 |
+ ('link', 'href'), |
|
2245 |
+ ('object', 'classid'), |
|
2246 |
+ ('object', 'codebase'), |
|
2247 |
+ ('object', 'data'), |
|
2248 |
+ ('object', 'usemap'), |
|
2249 |
+ ('q', 'cite'), |
|
2250 |
+ ('script', 'src')] |
|
2251 |
+ |
|
2252 |
+ def __init__(self, baseuri, encoding, type): |
|
2253 |
+ _BaseHTMLProcessor.__init__(self, encoding, type) |
|
2254 |
+ self.baseuri = baseuri |
|
2255 |
+ |
|
2256 |
+ def resolveURI(self, uri): |
|
2257 |
+ return _urljoin(self.baseuri, uri.strip()) |
|
2258 |
+ |
|
2259 |
+ def unknown_starttag(self, tag, attrs): |
|
2260 |
+ attrs = self.normalize_attrs(attrs) |
|
2261 |
+ attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] |
|
2262 |
+ _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) |
|
2263 |
+ |
|
2264 |
+def _resolveRelativeURIs(htmlSource, baseURI, encoding, type): |
|
2265 |
+ if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') |
|
2266 |
+ p = _RelativeURIResolver(baseURI, encoding, type) |
|
2267 |
+ p.feed(htmlSource) |
|
2268 |
+ return p.output() |
|
2269 |
+ |
|
2270 |
+class _HTMLSanitizer(_BaseHTMLProcessor): |
|
2271 |
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', |
|
2272 |
+ 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', |
|
2273 |
+ 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', |
|
2274 |
+ 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', |
|
2275 |
+ 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', |
|
2276 |
+ 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', |
|
2277 |
+ 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', |
|
2278 |
+ 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', |
|
2279 |
+ 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', |
|
2280 |
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', |
|
2281 |
+ 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', |
|
2282 |
+ 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] |
|
2283 |
+ |
|
2284 |
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', |
|
2285 |
+ 'action', 'align', 'alt', 'autoplay', 'autocomplete', 'autofocus', 'axis', |
|
2286 |
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border', |
|
2287 |
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', |
|
2288 |
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', |
|
2289 |
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', |
|
2290 |
+ 'colspan', 'compact', 'contenteditable', 'coords', 'data', 'datafld', |
|
2291 |
+ 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', |
|
2292 |
+ 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', |
|
2293 |
+ 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', |
|
2294 |
+ 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', |
|
2295 |
+ 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', |
|
2296 |
+ 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', |
|
2297 |
+ 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', |
|
2298 |
+ 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', |
|
2299 |
+ 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', |
|
2300 |
+ 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', |
|
2301 |
+ 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', |
|
2302 |
+ 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', |
|
2303 |
+ 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', |
|
2304 |
+ 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', |
|
2305 |
+ 'xml:lang'] |
|
2306 |
+ |
|
2307 |
+ unacceptable_elements_with_end_tag = ['script', 'applet', 'style'] |
|
2308 |
+ |
|
2309 |
+ acceptable_css_properties = ['azimuth', 'background-color', |
|
2310 |
+ 'border-bottom-color', 'border-collapse', 'border-color', |
|
2311 |
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear', |
|
2312 |
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', |
|
2313 |
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', |
|
2314 |
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', |
|
2315 |
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', |
|
2316 |
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', |
|
2317 |
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', |
|
2318 |
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', |
|
2319 |
+ 'white-space', 'width'] |
|
2320 |
+ |
|
2321 |
+ # survey of common keywords found in feeds |
|
2322 |
+ acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', |
|
2323 |
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', |
|
2324 |
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', |
|
2325 |
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', |
|
2326 |
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', |
|
2327 |
+ 'transparent', 'underline', 'white', 'yellow'] |
|
2328 |
+ |
|
2329 |
+ valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + |
|
2330 |
+ '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') |
|
2331 |
+ |
|
2332 |
+ mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math', |
|
2333 |
+ 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', |
|
2334 |
+ 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', |
|
2335 |
+ 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', |
|
2336 |
+ 'munderover', 'none', 'semantics'] |
|
2337 |
+ |
|
2338 |
+ mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', |
|
2339 |
+ 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', |
|
2340 |
+ 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', |
|
2341 |
+ 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', |
|
2342 |
+ 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', |
|
2343 |
+ 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', |
|
2344 |
+ 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', |
|
2345 |
+ 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', |
|
2346 |
+ 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'] |
|
2347 |
+ |
|
2348 |
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop |
|
2349 |
+ svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', |
|
2350 |
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', |
|
2351 |
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', |
|
2352 |
+ 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', |
|
2353 |
+ 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', |
|
2354 |
+ 'svg', 'switch', 'text', 'title', 'tspan', 'use'] |
|
2355 |
+ |
|
2356 |
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink |
|
2357 |
+ svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', |
|
2358 |
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType', |
|
2359 |
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', |
|
2360 |
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', |
|
2361 |
+ 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', |
|
2362 |
+ 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', |
|
2363 |
+ 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', |
|
2364 |
+ 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', |
|
2365 |
+ 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', |
|
2366 |
+ 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', |
|
2367 |
+ 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', |
|
2368 |
+ 'min', 'name', 'offset', 'opacity', 'orient', 'origin', |
|
2369 |
+ 'overline-position', 'overline-thickness', 'panose-1', 'path', |
|
2370 |
+ 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', |
|
2371 |
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', |
|
2372 |
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', |
|
2373 |
+ 'stop-color', 'stop-opacity', 'strikethrough-position', |
|
2374 |
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray', |
|
2375 |
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', |
|
2376 |
+ 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', |
|
2377 |
+ 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', |
|
2378 |
+ 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', |
|
2379 |
+ 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', |
|
2380 |
+ 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', |
|
2381 |
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', |
|
2382 |
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', |
|
2383 |
+ 'y2', 'zoomAndPan'] |
|
2384 |
+ |
|
2385 |
+ svg_attr_map = None |
|
2386 |
+ svg_elem_map = None |
|
2387 |
+ |
|
2388 |
+ acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', |
|
2389 |
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', |
|
2390 |
+ 'stroke-opacity'] |
|
2391 |
+ |
|
2392 |
+ def reset(self): |
|
2393 |
+ _BaseHTMLProcessor.reset(self) |
|
2394 |
+ self.unacceptablestack = 0 |
|
2395 |
+ self.mathmlOK = 0 |
|
2396 |
+ self.svgOK = 0 |
|
2397 |
+ |
|
2398 |
+ def unknown_starttag(self, tag, attrs): |
|
2399 |
+ acceptable_attributes = self.acceptable_attributes |
|
2400 |
+ keymap = {} |
|
2401 |
+ if not tag in self.acceptable_elements or self.svgOK: |
|
2402 |
+ if tag in self.unacceptable_elements_with_end_tag: |
|
2403 |
+ self.unacceptablestack += 1 |
|
2404 |
+ |
|
2405 |
+ # not otherwise acceptable, perhaps it is MathML or SVG? |
|
2406 |
+ if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: |
|
2407 |
+ self.mathmlOK += 1 |
|
2408 |
+ if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs: |
|
2409 |
+ self.svgOK += 1 |
|
2410 |
+ |
|
2411 |
+ # chose acceptable attributes based on tag class, else bail |
|
2412 |
+ if self.mathmlOK and tag in self.mathml_elements: |
|
2413 |
+ acceptable_attributes = self.mathml_attributes |
|
2414 |
+ elif self.svgOK and tag in self.svg_elements: |
|
2415 |
+ # for most vocabularies, lowercasing is a good idea. Many |
|
2416 |
+ # svg elements, however, are camel case |
|
2417 |
+ if not self.svg_attr_map: |
|
2418 |
+ lower=[attr.lower() for attr in self.svg_attributes] |
|
2419 |
+ mix=[a for a in self.svg_attributes if a not in lower] |
|
2420 |
+ self.svg_attributes = lower |
|
2421 |
+ self.svg_attr_map = dict([(a.lower(),a) for a in mix]) |
|
2422 |
+ |
|
2423 |
+ lower=[attr.lower() for attr in self.svg_elements] |
|
2424 |
+ mix=[a for a in self.svg_elements if a not in lower] |
|
2425 |
+ self.svg_elements = lower |
|
2426 |
+ self.svg_elem_map = dict([(a.lower(),a) for a in mix]) |
|
2427 |
+ acceptable_attributes = self.svg_attributes |
|
2428 |
+ tag = self.svg_elem_map.get(tag,tag) |
|
2429 |
+ keymap = self.svg_attr_map |
|
2430 |
+ elif not tag in self.acceptable_elements: |
|
2431 |
+ return |
|
2432 |
+ |
|
2433 |
+ # declare xlink namespace, if needed |
|
2434 |
+ if self.mathmlOK or self.svgOK: |
|
2435 |
+ if filter(lambda (n,v): n.startswith('xlink:'),attrs): |
|
2436 |
+ if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: |
|
2437 |
+ attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) |
|
2438 |
+ |
|
2439 |
+ clean_attrs = [] |
|
2440 |
+ for key, value in self.normalize_attrs(attrs): |
|
2441 |
+ if key in acceptable_attributes: |
|
2442 |
+ key=keymap.get(key,key) |
|
2443 |
+ clean_attrs.append((key,value)) |
|
2444 |
+ elif key=='style': |
|
2445 |
+ clean_value = self.sanitize_style(value) |
|
2446 |
+ if clean_value: clean_attrs.append((key,clean_value)) |
|
2447 |
+ _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) |
|
2448 |
+ |
|
2449 |
+ def unknown_endtag(self, tag): |
|
2450 |
+ if not tag in self.acceptable_elements: |
|
2451 |
+ if tag in self.unacceptable_elements_with_end_tag: |
|
2452 |
+ self.unacceptablestack -= 1 |
|
2453 |
+ if self.mathmlOK and tag in self.mathml_elements: |
|
2454 |
+ if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1 |
|
2455 |
+ elif self.svgOK and tag in self.svg_elements: |
|
2456 |
+ tag = self.svg_elem_map.get(tag,tag) |
|
2457 |
+ if tag == 'svg' and self.svgOK: self.svgOK -= 1 |
|
2458 |
+ else: |
|
2459 |
+ return |
|
2460 |
+ _BaseHTMLProcessor.unknown_endtag(self, tag) |
|
2461 |
+ |
|
2462 |
+ def handle_pi(self, text): |
|
2463 |
+ pass |
|
2464 |
+ |
|
2465 |
+ def handle_decl(self, text): |
|
2466 |
+ pass |
|
2467 |
+ |
|
2468 |
+ def handle_data(self, text): |
|
2469 |
+ if not self.unacceptablestack: |
|
2470 |
+ _BaseHTMLProcessor.handle_data(self, text) |
|
2471 |
+ |
|
2472 |
+ def sanitize_style(self, style): |
|
2473 |
+ # disallow urls |
|
2474 |
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) |
|
2475 |
+ |
|
2476 |
+ # gauntlet |
|
2477 |
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' |
|
2478 |
+ if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' |
|
2479 |
+ |
|
2480 |
+ clean = [] |
|
2481 |
+ for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): |
|
2482 |
+ if not value: continue |
|
2483 |
+ if prop.lower() in self.acceptable_css_properties: |
|
2484 |
+ clean.append(prop + ': ' + value + ';') |
|
2485 |
+ elif prop.split('-')[0].lower() in ['background','border','margin','padding']: |
|
2486 |
+ for keyword in value.split(): |
|
2487 |
+ if not keyword in self.acceptable_css_keywords and \ |
|
2488 |
+ not self.valid_css_values.match(keyword): |
|
2489 |
+ break |
|
2490 |
+ else: |
|
2491 |
+ clean.append(prop + ': ' + value + ';') |
|
2492 |
+ elif self.svgOK and prop.lower() in self.acceptable_svg_properties: |
|
2493 |
+ clean.append(prop + ': ' + value + ';') |
|
2494 |
+ |
|
2495 |
+ return ' '.join(clean) |
|
2496 |
+ |
|
2497 |
+ |
|
2498 |
+def _sanitizeHTML(htmlSource, encoding, type): |
|
2499 |
+ p = _HTMLSanitizer(encoding, type) |
|
2500 |
+ p.feed(htmlSource) |
|
2501 |
+ data = p.output() |
|
2502 |
+ if TIDY_MARKUP: |
|
2503 |
+ # loop through list of preferred Tidy interfaces looking for one that's installed, |
|
2504 |
+ # then set up a common _tidy function to wrap the interface-specific API. |
|
2505 |
+ _tidy = None |
|
2506 |
+ for tidy_interface in PREFERRED_TIDY_INTERFACES: |
|
2507 |
+ try: |
|
2508 |
+ if tidy_interface == "uTidy": |
|
2509 |
+ from tidy import parseString as _utidy |
|
2510 |
+ def _tidy(data, **kwargs): |
|
2511 |
+ return str(_utidy(data, **kwargs)) |
|
2512 |
+ break |
|
2513 |
+ elif tidy_interface == "mxTidy": |
|
2514 |
+ from mx.Tidy import Tidy as _mxtidy |
|
2515 |
+ def _tidy(data, **kwargs): |
|
2516 |
+ nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) |
|
2517 |
+ return data |
|
2518 |
+ break |
|
2519 |
+ except: |
|
2520 |
+ pass |
|
2521 |
+ if _tidy: |
|
2522 |
+ utf8 = type(data) == type(u'') |
|
2523 |
+ if utf8: |
|
2524 |
+ data = data.encode('utf-8') |
|
2525 |
+ data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") |
|
2526 |
+ if utf8: |
|
2527 |
+ data = unicode(data, 'utf-8') |
|
2528 |
+ if data.count('<body'): |
|
2529 |
+ data = data.split('<body', 1)[1] |
|
2530 |
+ if data.count('>'): |
|
2531 |
+ data = data.split('>', 1)[1] |
|
2532 |
+ if data.count('</body'): |
|
2533 |
+ data = data.split('</body', 1)[0] |
|
2534 |
+ data = data.strip().replace('\r\n', '\n') |
|
2535 |
+ return data |
|
2536 |
+ |
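+# Illustrative sketch (not part of the original module): _sanitizeHTML drops
+# elements and attributes that are not in the acceptable_* whitelists above,
+# so markup injected into feed content is stripped, roughly like this
+# (assuming TIDY_MARKUP is off, its default):
+#
+#     _sanitizeHTML('<p onclick="evil()">hi<script>evil()</script></p>', 'utf-8', 'text/html')
+#     # -> '<p>hi</p>'  (the onclick attribute and the script element are removed)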
|
2537 |
+class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): |
|
2538 |
+ def http_error_default(self, req, fp, code, msg, headers): |
|
2539 |
+ if ((code / 100) == 3) and (code != 304): |
|
2540 |
+ return self.http_error_302(req, fp, code, msg, headers) |
|
2541 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2542 |
+ infourl.status = code |
|
2543 |
+ return infourl |
|
2544 |
+ |
|
2545 |
+ def http_error_302(self, req, fp, code, msg, headers): |
|
2546 |
+ if headers.dict.has_key('location'): |
|
2547 |
+ infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) |
|
2548 |
+ else: |
|
2549 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2550 |
+ if not hasattr(infourl, 'status'): |
|
2551 |
+ infourl.status = code |
|
2552 |
+ return infourl |
|
2553 |
+ |
|
2554 |
+ def http_error_301(self, req, fp, code, msg, headers): |
|
2555 |
+ if headers.dict.has_key('location'): |
|
2556 |
+ infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) |
|
2557 |
+ else: |
|
2558 |
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2559 |
+ if not hasattr(infourl, 'status'): |
|
2560 |
+ infourl.status = code |
|
2561 |
+ return infourl |
|
2562 |
+ |
|
2563 |
+ http_error_300 = http_error_302 |
|
2564 |
+ http_error_303 = http_error_302 |
|
2565 |
+ http_error_307 = http_error_302 |
|
2566 |
+ |
|
2567 |
+ def http_error_401(self, req, fp, code, msg, headers): |
|
2568 |
+ # Check if |
|
2569 |
+ # - server requires digest auth, AND |
|
2570 |
+ # - we tried (unsuccessfully) with basic auth, AND |
|
2571 |
+ # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions) |
|
2572 |
+ # If all conditions hold, parse authentication information |
|
2573 |
+ # out of the Authorization header we sent the first time |
|
2574 |
+ # (for the username and password) and the WWW-Authenticate |
|
2575 |
+ # header the server sent back (for the realm) and retry |
|
2576 |
+ # the request with the appropriate digest auth headers instead. |
|
2577 |
+ # This evil genius hack has been brought to you by Aaron Swartz. |
|
2578 |
+ host = urlparse.urlparse(req.get_full_url())[1] |
|
2579 |
+ try: |
|
2580 |
+ assert sys.version.split()[0] >= '2.3.3' |
|
2581 |
+ assert base64 != None |
|
2582 |
+ user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') |
|
2583 |
+ realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] |
|
2584 |
+ self.add_password(realm, host, user, passw) |
|
2585 |
+ retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) |
|
2586 |
+ self.reset_retry_count() |
|
2587 |
+ return retry |
|
2588 |
+ except: |
|
2589 |
+ return self.http_error_default(req, fp, code, msg, headers) |
|
2590 |
+ |
|
2591 |
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): |
|
2592 |
+ """URL, filename, or string --> stream |
|
2593 |
+ |
|
2594 |
+ This function lets you define parsers that take any input source |
|
2595 |
+ (URL, pathname to local or network file, or actual data as a string) |
|
2596 |
+ and deal with it in a uniform manner. Returned object is guaranteed |
|
2597 |
+ to have all the basic stdio read methods (read, readline, readlines). |
|
2598 |
+ Just .close() the object when you're done with it. |
|
2599 |
+ |
|
2600 |
+ If the etag argument is supplied, it will be used as the value of an |
|
2601 |
+ If-None-Match request header. |
|
2602 |
+ |
|
2603 |
+ If the modified argument is supplied, it can be a tuple of 9 integers |
|
2604 |
+ (as returned by gmtime() in the standard Python time module) or a date |
|
2605 |
+ string in any format supported by feedparser. Regardless, it MUST |
|
2606 |
+ be in GMT (Greenwich Mean Time). It will be reformatted into an |
|
2607 |
+ RFC 1123-compliant date and used as the value of an If-Modified-Since |
|
2608 |
+ request header. |
|
2609 |
+ |
|
2610 |
+ If the agent argument is supplied, it will be used as the value of a |
|
2611 |
+ User-Agent request header. |
|
2612 |
+ |
|
2613 |
+ If the referrer argument is supplied, it will be used as the value of a |
|
2614 |
+ Referer[sic] request header. |
|
2615 |
+ |
|
2616 |
+ If handlers is supplied, it is a list of handlers used to build a |
|
2617 |
+ urllib2 opener. |
|
2618 |
+ """ |
|
2619 |
+ |
|
2620 |
+ if hasattr(url_file_stream_or_string, 'read'): |
|
2621 |
+ return url_file_stream_or_string |
|
2622 |
+ |
|
2623 |
+ if url_file_stream_or_string == '-': |
|
2624 |
+ return sys.stdin |
|
2625 |
+ |
|
2626 |
+ if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): |
|
2627 |
+ if not agent: |
|
2628 |
+ agent = USER_AGENT |
|
2629 |
+ # test for inline user:password for basic auth |
|
2630 |
+ auth = None |
|
2631 |
+ if base64: |
|
2632 |
+ urltype, rest = urllib.splittype(url_file_stream_or_string) |
|
2633 |
+ realhost, rest = urllib.splithost(rest) |
|
2634 |
+ if realhost: |
|
2635 |
+ user_passwd, realhost = urllib.splituser(realhost) |
|
2636 |
+ if user_passwd: |
|
2637 |
+ url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) |
|
2638 |
+ auth = base64.encodestring(user_passwd).strip() |
|
2639 |
+ |
|
2640 |
+ # iri support |
|
2641 |
+ try: |
|
2642 |
+ if isinstance(url_file_stream_or_string,unicode): |
|
2643 |
+ url_file_stream_or_string = url_file_stream_or_string.encode('idna') |
|
2644 |
+ else: |
|
2645 |
+ url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna') |
|
2646 |
+ except: |
|
2647 |
+ pass |
|
2648 |
+ |
|
2649 |
+ # try to open with urllib2 (to use optional headers) |
|
2650 |
+ request = urllib2.Request(url_file_stream_or_string) |
|
2651 |
+ request.add_header('User-Agent', agent) |
|
2652 |
+ if etag: |
|
2653 |
+ request.add_header('If-None-Match', etag) |
|
2654 |
+ if type(modified) == type(''): |
|
2655 |
+ modified = _parse_date(modified) |
|
2656 |
+ if modified: |
|
2657 |
+ # format into an RFC 1123-compliant timestamp. We can't use |
|
2658 |
+ # time.strftime() since the %a and %b directives can be affected |
|
2659 |
+ # by the current locale, but RFC 2616 states that dates must be |
|
2660 |
+ # in English. |
|
2661 |
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] |
|
2662 |
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
2663 |
+ request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) |
|
2664 |
+ if referrer: |
|
2665 |
+ request.add_header('Referer', referrer) |
|
2666 |
+ if gzip and zlib: |
|
2667 |
+ request.add_header('Accept-encoding', 'gzip, deflate') |
|
2668 |
+ elif gzip: |
|
2669 |
+ request.add_header('Accept-encoding', 'gzip') |
|
2670 |
+ elif zlib: |
|
2671 |
+ request.add_header('Accept-encoding', 'deflate') |
|
2672 |
+ else: |
|
2673 |
+ request.add_header('Accept-encoding', '') |
|
2674 |
+ if auth: |
|
2675 |
+ request.add_header('Authorization', 'Basic %s' % auth) |
|
2676 |
+ if ACCEPT_HEADER: |
|
2677 |
+ request.add_header('Accept', ACCEPT_HEADER) |
|
2678 |
+ request.add_header('A-IM', 'feed') # RFC 3229 support |
|
2679 |
+ opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) |
|
2680 |
+ opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent |
|
2681 |
+ try: |
|
2682 |
+ return opener.open(request) |
|
2683 |
+ finally: |
|
2684 |
+ opener.close() # JohnD |
|
2685 |
+ |
|
2686 |
+ # try to open with native open function (if url_file_stream_or_string is a filename) |
|
2687 |
+ try: |
|
2688 |
+ return open(url_file_stream_or_string) |
|
2689 |
+ except: |
|
2690 |
+ pass |
|
2691 |
+ |
|
2692 |
+ # treat url_file_stream_or_string as string |
|
2693 |
+ return _StringIO(str(url_file_stream_or_string)) |
|
2694 |
+ |
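+# Illustrative sketch (not part of the original module): _open_resource accepts
+# a URL, a local filename, an open stream, or raw feed data, and returns a
+# readable object in every case. The URL and filename below are hypothetical:
+#
+#     f = _open_resource('http://example.com/feed.xml', None, None, None, None, [])
+#     f = _open_resource('/tmp/feed.xml', None, None, None, None, [])
+#     f = _open_resource('<rss version="2.0"></rss>', None, None, None, None, [])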
|
2695 |
+_date_handlers = [] |
|
2696 |
+def registerDateHandler(func): |
|
2697 |
+ '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' |
|
2698 |
+ _date_handlers.insert(0, func) |
|
2699 |
+ |
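+# Illustrative sketch (not part of the original module): registerDateHandler
+# prepends a callable that takes a date string and returns a 9-tuple in GMT,
+# so extra formats can be supported without touching _parse_date. The handler
+# below is hypothetical:
+#
+#     def _parse_date_unix_epoch(dateString):
+#         '''Parse a bare Unix timestamp such as "1295554233"'''
+#         if not dateString.isdigit(): return None
+#         return time.gmtime(int(dateString))
+#     registerDateHandler(_parse_date_unix_epoch)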
|
2700 |
+# ISO-8601 date parsing routines written by Fazal Majid. |
|
2701 |
+# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 |
|
2702 |
+# parser is beyond the scope of feedparser and would be a worthwhile addition |
|
2703 |
+# to the Python library. |
|
2704 |
+# A single regular expression cannot parse ISO 8601 date formats into groups |
|
2705 |
+# as the standard is highly irregular (for instance, is 030104 2003-01-04 or
|
2706 |
+# 0301-04-01?), so we use templates instead.
|
2707 |
+# Please note the order in templates is significant because we need a |
|
2708 |
+# greedy match. |
|
2709 |
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', |
|
2710 |
+ 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', |
|
2711 |
+ '-YY-?MM', '-OOO', '-YY', |
|
2712 |
+ '--MM-?DD', '--MM', |
|
2713 |
+ '---DD', |
|
2714 |
+ 'CC', ''] |
|
2715 |
+_iso8601_re = [ |
|
2716 |
+ tmpl.replace( |
|
2717 |
+ 'YYYY', r'(?P<year>\d{4})').replace( |
|
2718 |
+ 'YY', r'(?P<year>\d\d)').replace( |
|
2719 |
+ 'MM', r'(?P<month>[01]\d)').replace( |
|
2720 |
+ 'DD', r'(?P<day>[0123]\d)').replace( |
|
2721 |
+ 'OOO', r'(?P<ordinal>[0123]\d\d)').replace( |
|
2722 |
+ 'CC', r'(?P<century>\d\d$)') |
|
2723 |
+ + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})' |
|
2724 |
+ + r'(:(?P<second>\d{2}(\.\d*)?))?' |
|
2725 |
+ + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' |
|
2726 |
+ for tmpl in _iso8601_tmpl] |
|
2727 |
+del tmpl |
|
2728 |
+_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] |
|
2729 |
+del regex |
|
2730 |
+def _parse_date_iso8601(dateString): |
|
2731 |
+ '''Parse a variety of ISO-8601-compatible formats like 20040105''' |
|
2732 |
+ m = None |
|
2733 |
+ for _iso8601_match in _iso8601_matches: |
|
2734 |
+ m = _iso8601_match(dateString) |
|
2735 |
+ if m: break |
|
2736 |
+ if not m: return |
|
2737 |
+ if m.span() == (0, 0): return |
|
2738 |
+ params = m.groupdict() |
|
2739 |
+ ordinal = params.get('ordinal', 0) |
|
2740 |
+ if ordinal: |
|
2741 |
+ ordinal = int(ordinal) |
|
2742 |
+ else: |
|
2743 |
+ ordinal = 0 |
|
2744 |
+ year = params.get('year', '--') |
|
2745 |
+ if not year or year == '--': |
|
2746 |
+ year = time.gmtime()[0] |
|
2747 |
+ elif len(year) == 2: |
|
2748 |
+ # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 |
|
2749 |
+ year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
2750 |
+ else: |
|
2751 |
+ year = int(year) |
|
2752 |
+ month = params.get('month', '-') |
|
2753 |
+ if not month or month == '-': |
|
2754 |
+ # ordinals are NOT normalized by mktime, we simulate them |
|
2755 |
+ # by setting month=1, day=ordinal |
|
2756 |
+ if ordinal: |
|
2757 |
+ month = 1 |
|
2758 |
+ else: |
|
2759 |
+ month = time.gmtime()[1] |
|
2760 |
+ month = int(month) |
|
2761 |
+ day = params.get('day', 0) |
|
2762 |
+ if not day: |
|
2763 |
+ # see above |
|
2764 |
+ if ordinal: |
|
2765 |
+ day = ordinal |
|
2766 |
+ elif params.get('century', 0) or \ |
|
2767 |
+ params.get('year', 0) or params.get('month', 0): |
|
2768 |
+ day = 1 |
|
2769 |
+ else: |
|
2770 |
+ day = time.gmtime()[2] |
|
2771 |
+ else: |
|
2772 |
+ day = int(day) |
|
2773 |
+ # special case of the century - is the first year of the 21st century |
|
2774 |
+ # 2000 or 2001 ? The debate goes on... |
|
2775 |
+ if 'century' in params.keys(): |
|
2776 |
+ year = (int(params['century']) - 1) * 100 + 1 |
|
2777 |
+ # in ISO 8601 most fields are optional |
|
2778 |
+ for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: |
|
2779 |
+ if not params.get(field, None): |
|
2780 |
+ params[field] = 0 |
|
2781 |
+ hour = int(params.get('hour', 0)) |
|
2782 |
+ minute = int(params.get('minute', 0)) |
|
2783 |
+ second = int(float(params.get('second', 0))) |
|
2784 |
+ # weekday is normalized by mktime(), we can ignore it |
|
2785 |
+ weekday = 0 |
|
2786 |
+ daylight_savings_flag = -1 |
|
2787 |
+ tm = [year, month, day, hour, minute, second, weekday, |
|
2788 |
+ ordinal, daylight_savings_flag] |
|
2789 |
+ # ISO 8601 time zone adjustments |
|
2790 |
+ tz = params.get('tz') |
|
2791 |
+ if tz and tz != 'Z': |
|
2792 |
+ if tz[0] == '-': |
|
2793 |
+ tm[3] += int(params.get('tzhour', 0)) |
|
2794 |
+ tm[4] += int(params.get('tzmin', 0)) |
|
2795 |
+ elif tz[0] == '+': |
|
2796 |
+ tm[3] -= int(params.get('tzhour', 0)) |
|
2797 |
+ tm[4] -= int(params.get('tzmin', 0)) |
|
2798 |
+ else: |
|
2799 |
+ return None |
|
2800 |
+ # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) |
|
2801 |
+ # which is guaranteed to normalize d/m/y/h/m/s. |
|
2802 |
+ # Many implementations have bugs, but we'll pretend they don't. |
|
2803 |
+ return time.localtime(time.mktime(tm)) |
|
2804 |
+registerDateHandler(_parse_date_iso8601) |
|
2805 |
+ |
|
2806 |
+# 8-bit date handling routines written by ytrewq1. |
|
2807 |
+_korean_year = u'\ub144' # b3e2 in euc-kr |
|
2808 |
+_korean_month = u'\uc6d4' # bff9 in euc-kr |
|
2809 |
+_korean_day = u'\uc77c' # c0cf in euc-kr |
|
2810 |
+_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr |
|
2811 |
+_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr |
|
2812 |
+ |
|
2813 |
+_korean_onblog_date_re = \ |
|
2814 |
+ re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ |
|
2815 |
+ (_korean_year, _korean_month, _korean_day)) |
|
2816 |
+_korean_nate_date_re = \ |
|
2817 |
+ re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ |
|
2818 |
+ (_korean_am, _korean_pm)) |
|
2819 |
+def _parse_date_onblog(dateString): |
|
2820 |
+ '''Parse a string according to the OnBlog 8-bit date format''' |
|
2821 |
+ m = _korean_onblog_date_re.match(dateString) |
|
2822 |
+ if not m: return |
|
2823 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2824 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2825 |
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
2826 |
+ 'zonediff': '+09:00'} |
|
2827 |
+ if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) |
|
2828 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2829 |
+registerDateHandler(_parse_date_onblog) |
|
2830 |
+ |
|
2831 |
+def _parse_date_nate(dateString): |
|
2832 |
+ '''Parse a string according to the Nate 8-bit date format''' |
|
2833 |
+ m = _korean_nate_date_re.match(dateString) |
|
2834 |
+ if not m: return |
|
2835 |
+ hour = int(m.group(5)) |
|
2836 |
+ ampm = m.group(4) |
|
2837 |
+ if (ampm == _korean_pm): |
|
2838 |
+ hour += 12 |
|
2839 |
+ hour = str(hour) |
|
2840 |
+ if len(hour) == 1: |
|
2841 |
+ hour = '0' + hour |
|
2842 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2843 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2844 |
+ 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ |
|
2845 |
+ 'zonediff': '+09:00'} |
|
2846 |
+ if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) |
|
2847 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2848 |
+registerDateHandler(_parse_date_nate) |
|
2849 |
+ |
|
2850 |
+_mssql_date_re = \ |
|
2851 |
+ re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') |
|
2852 |
+def _parse_date_mssql(dateString): |
|
2853 |
+ '''Parse a string according to the MS SQL date format''' |
|
2854 |
+ m = _mssql_date_re.match(dateString) |
|
2855 |
+ if not m: return |
|
2856 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2857 |
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2858 |
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
2859 |
+ 'zonediff': '+09:00'} |
|
2860 |
+ if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) |
|
2861 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2862 |
+registerDateHandler(_parse_date_mssql) |
|
2863 |
+ |
|
2864 |
+# Unicode strings for Greek date strings |
|
2865 |
+_greek_months = \ |
|
2866 |
+ { \ |
|
2867 |
+ u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 |
|
2868 |
+ u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 |
|
2869 |
+ u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 |
|
2870 |
+ u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 |
|
2871 |
+ u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 |
|
2872 |
+ u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 |
|
2873 |
+ u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 |
|
2874 |
+ u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 |
|
2875 |
+ u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 |
|
2876 |
+ u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 |
|
2877 |
+ u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 |
|
2878 |
+ u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 |
|
2879 |
+ u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 |
|
2880 |
+ u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 |
|
2881 |
+ u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 |
|
2882 |
+ u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 |
|
2883 |
+ u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 |
|
2884 |
+ u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 |
|
2885 |
+ u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 |
|
2886 |
+ } |
|
2887 |
+ |
|
2888 |
+_greek_wdays = \ |
|
2889 |
+ { \ |
|
2890 |
+ u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 |
|
2891 |
+ u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 |
|
2892 |
+ u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 |
|
2893 |
+ u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 |
|
2894 |
+ u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 |
|
2895 |
+ u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 |
|
2896 |
+ u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 |
|
2897 |
+ } |
|
2898 |
+ |
|
2899 |
+_greek_date_format_re = \ |
|
2900 |
+ re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') |
|
2901 |
+ |
|
2902 |
+def _parse_date_greek(dateString): |
|
2903 |
+ '''Parse a string according to a Greek 8-bit date format.''' |
|
2904 |
+ m = _greek_date_format_re.match(dateString) |
|
2905 |
+ if not m: return |
|
2906 |
+ try: |
|
2907 |
+ wday = _greek_wdays[m.group(1)] |
|
2908 |
+ month = _greek_months[m.group(3)] |
|
2909 |
+ except: |
|
2910 |
+ return |
|
2911 |
+ rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ |
|
2912 |
+ {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ |
|
2913 |
+ 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ |
|
2914 |
+ 'zonediff': m.group(8)} |
|
2915 |
+ if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) |
|
2916 |
+ return _parse_date_rfc822(rfc822date) |
|
2917 |
+registerDateHandler(_parse_date_greek) |
|
2918 |
+ |
|
2919 |
+# Unicode strings for Hungarian date strings |
|
2920 |
+_hungarian_months = \ |
|
2921 |
+ { \ |
|
2922 |
+ u'janu\u00e1r': u'01', # e1 in iso-8859-2 |
|
2923 |
+ u'febru\u00e1ri': u'02', # e1 in iso-8859-2 |
|
2924 |
+ u'm\u00e1rcius': u'03', # e1 in iso-8859-2 |
|
2925 |
+ u'\u00e1prilis': u'04', # e1 in iso-8859-2 |
|
2926 |
+ u'm\u00e1ujus': u'05', # e1 in iso-8859-2 |
|
2927 |
+ u'j\u00fanius': u'06', # fa in iso-8859-2 |
|
2928 |
+ u'j\u00falius': u'07', # fa in iso-8859-2 |
|
2929 |
+ u'augusztus': u'08', |
|
2930 |
+ u'szeptember': u'09', |
|
2931 |
+ u'okt\u00f3ber': u'10', # f3 in iso-8859-2 |
|
2932 |
+ u'november': u'11', |
|
2933 |
+ u'december': u'12', |
|
2934 |
+ } |
|
2935 |
+ |
|
2936 |
+_hungarian_date_format_re = \ |
|
2937 |
+ re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') |
|
2938 |
+ |
|
2939 |
+def _parse_date_hungarian(dateString): |
|
2940 |
+ '''Parse a string according to a Hungarian 8-bit date format.''' |
|
2941 |
+ m = _hungarian_date_format_re.match(dateString) |
|
2942 |
+ if not m: return |
|
2943 |
+ try: |
|
2944 |
+ month = _hungarian_months[m.group(2)] |
|
2945 |
+ day = m.group(3) |
|
2946 |
+ if len(day) == 1: |
|
2947 |
+ day = '0' + day |
|
2948 |
+ hour = m.group(4) |
|
2949 |
+ if len(hour) == 1: |
|
2950 |
+ hour = '0' + hour |
|
2951 |
+ except: |
|
2952 |
+ return |
|
2953 |
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ |
|
2954 |
+ {'year': m.group(1), 'month': month, 'day': day,\ |
|
2955 |
+ 'hour': hour, 'minute': m.group(5),\ |
|
2956 |
+ 'zonediff': m.group(6)} |
|
2957 |
+ if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) |
|
2958 |
+ return _parse_date_w3dtf(w3dtfdate) |
|
2959 |
+registerDateHandler(_parse_date_hungarian) |
|
2960 |
+ |
|
2961 |
+# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by |
|
2962 |
+# Drake and licensed under the Python license. Removed all range checking |
|
2963 |
+# for month, day, hour, minute, and second, since mktime will normalize |
|
2964 |
+# these later |
|
2965 |
+def _parse_date_w3dtf(dateString): |
|
2966 |
+ def __extract_date(m): |
|
2967 |
+ year = int(m.group('year')) |
|
2968 |
+ if year < 100: |
|
2969 |
+ year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
2970 |
+ if year < 1000: |
|
2971 |
+ return 0, 0, 0 |
|
2972 |
+ julian = m.group('julian') |
|
2973 |
+ if julian: |
|
2974 |
+ julian = int(julian) |
|
2975 |
+ month = julian / 30 + 1 |
|
2976 |
+ day = julian % 30 + 1 |
|
2977 |
+ jday = None |
|
2978 |
+ while jday != julian: |
|
2979 |
+ t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) |
|
2980 |
+ jday = time.gmtime(t)[-2] |
|
2981 |
+ diff = abs(jday - julian) |
|
2982 |
+ if jday > julian: |
|
2983 |
+ if diff < day: |
|
2984 |
+ day = day - diff |
|
2985 |
+ else: |
|
2986 |
+ month = month - 1 |
|
2987 |
+ day = 31 |
|
2988 |
+ elif jday < julian: |
|
2989 |
+ if day + diff < 28: |
|
2990 |
+ day = day + diff |
|
2991 |
+ else: |
|
2992 |
+ month = month + 1 |
|
2993 |
+ return year, month, day |
|
2994 |
+ month = m.group('month') |
|
2995 |
+ day = 1 |
|
2996 |
+ if month is None: |
|
2997 |
+ month = 1 |
|
2998 |
+ else: |
|
2999 |
+ month = int(month) |
|
3000 |
+ day = m.group('day') |
|
3001 |
+ if day: |
|
3002 |
+ day = int(day) |
|
3003 |
+ else: |
|
3004 |
+ day = 1 |
|
3005 |
+ return year, month, day |
|
3006 |
+ |
|
3007 |
+ def __extract_time(m): |
|
3008 |
+ if not m: |
|
3009 |
+ return 0, 0, 0 |
|
3010 |
+ hours = m.group('hours') |
|
3011 |
+ if not hours: |
|
3012 |
+ return 0, 0, 0 |
|
3013 |
+ hours = int(hours) |
|
3014 |
+ minutes = int(m.group('minutes')) |
|
3015 |
+ seconds = m.group('seconds') |
|
3016 |
+ if seconds: |
|
3017 |
+ seconds = int(seconds) |
|
3018 |
+ else: |
|
3019 |
+ seconds = 0 |
|
3020 |
+ return hours, minutes, seconds |
|
3021 |
+ |
|
3022 |
+ def __extract_tzd(m): |
|
3023 |
+ '''Return the Time Zone Designator as an offset in seconds from UTC.''' |
|
3024 |
+ if not m: |
|
3025 |
+ return 0 |
|
3026 |
+ tzd = m.group('tzd') |
|
3027 |
+ if not tzd: |
|
3028 |
+ return 0 |
|
3029 |
+ if tzd == 'Z': |
|
3030 |
+ return 0 |
|
3031 |
+ hours = int(m.group('tzdhours')) |
|
3032 |
+ minutes = m.group('tzdminutes') |
|
3033 |
+ if minutes: |
|
3034 |
+ minutes = int(minutes) |
|
3035 |
+ else: |
|
3036 |
+ minutes = 0 |
|
3037 |
+ offset = (hours*60 + minutes) * 60 |
|
3038 |
+ if tzd[0] == '+': |
|
3039 |
+ return -offset |
|
3040 |
+ return offset |
|
3041 |
+ |
|
3042 |
+ __date_re = ('(?P<year>\d\d\d\d)' |
|
3043 |
+ '(?:(?P<dsep>-|)' |
|
3044 |
+ '(?:(?P<julian>\d\d\d)' |
|
3045 |
+ '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?') |
|
3046 |
+ __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)' |
|
3047 |
+ __tzd_rx = re.compile(__tzd_re) |
|
3048 |
+ __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)' |
|
3049 |
+ '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?' |
|
3050 |
+ + __tzd_re) |
|
3051 |
+ __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) |
|
3052 |
+ __datetime_rx = re.compile(__datetime_re) |
|
3053 |
+ m = __datetime_rx.match(dateString) |
|
3054 |
+ if (m is None) or (m.group() != dateString): return |
|
3055 |
+ gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) |
|
3056 |
+ if gmt[0] == 0: return |
|
3057 |
+ return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) |
|
3058 |
+registerDateHandler(_parse_date_w3dtf) |
|
3059 |
+ |
|
3060 |
+def _parse_date_rfc822(dateString): |
|
3061 |
+ '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' |
|
3062 |
+ data = dateString.split() |
|
3063 |
+ if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: |
|
3064 |
+ del data[0] |
|
3065 |
+ if len(data) == 4: |
|
3066 |
+ s = data[3] |
|
3067 |
+ i = s.find('+') |
|
3068 |
+ if i > 0: |
|
3069 |
+ data[3:] = [s[:i], s[i+1:]] |
|
3070 |
+ else: |
|
3071 |
+ data.append('') |
|
3072 |
+ dateString = " ".join(data) |
|
3073 |
+ if len(data) < 5: |
|
3074 |
+ dateString += ' 00:00:00 GMT' |
|
3075 |
+ tm = rfc822.parsedate_tz(dateString) |
|
3076 |
+ if tm: |
|
3077 |
+ return time.gmtime(rfc822.mktime_tz(tm)) |
|
3078 |
+# rfc822.py defines several time zones, but we define some extra ones. |
|
3079 |
+# 'ET' is equivalent to 'EST', etc. |
|
3080 |
+_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} |
|
3081 |
+rfc822._timezones.update(_additional_timezones) |
|
3082 |
+registerDateHandler(_parse_date_rfc822) |
|
3083 |
+ |
|
3084 |
+def _parse_date_perforce(aDateString): |
|
3085 |
+ """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" |
|
3086 |
+ # Fri, 2006/09/15 08:19:53 EDT |
|
3087 |
+ _my_date_pattern = re.compile( \ |
|
3088 |
+ r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') |
|
3089 |
+ |
|
3090 |
+ dow, year, month, day, hour, minute, second, tz = \ |
|
3091 |
+ _my_date_pattern.search(aDateString).groups() |
|
3092 |
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
3093 |
+ dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) |
|
3094 |
+ tm = rfc822.parsedate_tz(dateString) |
|
3095 |
+ if tm: |
|
3096 |
+ return time.gmtime(rfc822.mktime_tz(tm)) |
|
3097 |
+registerDateHandler(_parse_date_perforce) |
|
3098 |
+ |
|
3099 |
+def _parse_date(dateString): |
|
3100 |
+ '''Parses a variety of date formats into a 9-tuple in GMT''' |
|
3101 |
+ for handler in _date_handlers: |
|
3102 |
+ try: |
|
3103 |
+ date9tuple = handler(dateString) |
|
3104 |
+ if not date9tuple: continue |
|
3105 |
+ if len(date9tuple) != 9: |
|
3106 |
+ if _debug: sys.stderr.write('date handler function must return 9-tuple\n') |
|
3107 |
+ raise ValueError |
|
3108 |
+ map(int, date9tuple) |
|
3109 |
+ return date9tuple |
|
3110 |
+ except Exception as e: |
|
3111 |
+ if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) |
|
3112 |
+ pass |
|
3113 |
+ return None |
|
3114 |
+ |
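+# Illustrative sketch (not part of the original module): _parse_date tries each
+# registered handler in turn and returns a 9-tuple in GMT, or None if nothing
+# matched, e.g.:
+#
+#     _parse_date('2004-01-05T12:30:00Z')            # 9-tuple for 2004-01-05 12:30:00 GMT
+#     _parse_date('Mon, 05 Jan 2004 12:30:00 GMT')   # same instant, via the RFC 822 handler
+#     _parse_date('this is not a date')              # None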
|
3115 |
+def _getCharacterEncoding(http_headers, xml_data): |
|
3116 |
+ '''Get the character encoding of the XML document |
|
3117 |
+ |
|
3118 |
+ http_headers is a dictionary |
|
3119 |
+ xml_data is a raw string (not Unicode) |
|
3120 |
+ |
|
3121 |
+ This is so much trickier than it sounds, it's not even funny. |
|
3122 |
+ According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type |
|
3123 |
+ is application/xml, application/*+xml, |
|
3124 |
+ application/xml-external-parsed-entity, or application/xml-dtd, |
|
3125 |
+ the encoding given in the charset parameter of the HTTP Content-Type |
|
3126 |
+ takes precedence over the encoding given in the XML prefix within the |
|
3127 |
+ document, and defaults to 'utf-8' if neither are specified. But, if |
|
3128 |
+ the HTTP Content-Type is text/xml, text/*+xml, or |
|
3129 |
+ text/xml-external-parsed-entity, the encoding given in the XML prefix |
|
3130 |
+ within the document is ALWAYS IGNORED and only the encoding given in |
|
3131 |
+ the charset parameter of the HTTP Content-Type header should be |
|
3132 |
+ respected, and it defaults to 'us-ascii' if not specified. |
|
3133 |
+ |
|
3134 |
+ Furthermore, discussion on the atom-syntax mailing list with the |
|
3135 |
+ author of RFC 3023 leads me to the conclusion that any document |
|
3136 |
+ served with a Content-Type of text/* and no charset parameter |
|
3137 |
+ must be treated as us-ascii. (We now do this.) And also that it |
|
3138 |
+ must always be flagged as non-well-formed. (We now do this too.) |
|
3139 |
+ |
|
3140 |
+ If Content-Type is unspecified (input was local file or non-HTTP source) |
|
3141 |
+ or unrecognized (server just got it totally wrong), then go by the |
|
3142 |
+ encoding given in the XML prefix of the document and default to |
|
3143 |
+ 'iso-8859-1' as per the HTTP specification (RFC 2616). |
|
3144 |
+ |
|
3145 |
+ Then, assuming we didn't find a character encoding in the HTTP headers |
|
3146 |
+ (and the HTTP Content-type allowed us to look in the body), we need |
|
3147 |
+ to sniff the first few bytes of the XML data and try to determine |
|
3148 |
+ whether the encoding is ASCII-compatible. Section F of the XML |
|
3149 |
+ specification shows the way here: |
|
3150 |
+ http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
3151 |
+ |
|
3152 |
+ If the sniffed encoding is not ASCII-compatible, we need to make it |
|
3153 |
+ ASCII compatible so that we can sniff further into the XML declaration |
|
3154 |
+ to find the encoding attribute, which will tell us the true encoding. |
|
3155 |
+ |
|
3156 |
+ Of course, none of this guarantees that we will be able to parse the |
|
3157 |
+ feed in the declared character encoding (assuming it was declared |
|
3158 |
+ correctly, which many are not). CJKCodecs and iconv_codec help a lot; |
|
3159 |
+ you should definitely install them if you can. |
|
3160 |
+ http://cjkpython.i18n.org/ |
|
3161 |
+ ''' |
|
3162 |
+ |
|
3163 |
+ def _parseHTTPContentType(content_type): |
|
3164 |
+ '''takes HTTP Content-Type header and returns (content type, charset) |
|
3165 |
+ |
|
3166 |
+ If no charset is specified, returns (content type, '') |
|
3167 |
+ If no content type is specified, returns ('', '') |
|
3168 |
+ Both return parameters are guaranteed to be lowercase strings |
|
3169 |
+ ''' |
|
3170 |
+ content_type = content_type or '' |
|
3171 |
+ content_type, params = cgi.parse_header(content_type) |
|
3172 |
+ return content_type, params.get('charset', '').replace("'", '') |
|
3173 |
+ |
|
3174 |
+ sniffed_xml_encoding = '' |
|
3175 |
+ xml_encoding = '' |
|
3176 |
+ true_encoding = '' |
|
3177 |
+ http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) |
|
3178 |
+ # Must sniff for non-ASCII-compatible character encodings before |
|
3179 |
+ # searching for XML declaration. This heuristic is defined in |
|
3180 |
+ # section F of the XML specification: |
|
3181 |
+ # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
3182 |
+ try: |
|
3183 |
+ if xml_data[:4] == '\x4c\x6f\xa7\x94': |
|
3184 |
+ # EBCDIC |
|
3185 |
+ xml_data = _ebcdic_to_ascii(xml_data) |
|
3186 |
+ elif xml_data[:4] == '\x00\x3c\x00\x3f': |
|
3187 |
+ # UTF-16BE |
|
3188 |
+ sniffed_xml_encoding = 'utf-16be' |
|
3189 |
+ xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') |
|
3190 |
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): |
|
3191 |
+ # UTF-16BE with BOM |
|
3192 |
+ sniffed_xml_encoding = 'utf-16be' |
|
3193 |
+ xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') |
|
3194 |
+ elif xml_data[:4] == '\x3c\x00\x3f\x00': |
|
3195 |
+ # UTF-16LE |
|
3196 |
+ sniffed_xml_encoding = 'utf-16le' |
|
3197 |
+ xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') |
|
3198 |
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): |
|
3199 |
+ # UTF-16LE with BOM |
|
3200 |
+ sniffed_xml_encoding = 'utf-16le' |
|
3201 |
+ xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') |
|
3202 |
+ elif xml_data[:4] == '\x00\x00\x00\x3c': |
|
3203 |
+ # UTF-32BE |
|
3204 |
+ sniffed_xml_encoding = 'utf-32be' |
|
3205 |
+ xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') |
|
3206 |
+ elif xml_data[:4] == '\x3c\x00\x00\x00': |
|
3207 |
+ # UTF-32LE |
|
3208 |
+ sniffed_xml_encoding = 'utf-32le' |
|
3209 |
+ xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') |
|
3210 |
+ elif xml_data[:4] == '\x00\x00\xfe\xff': |
|
3211 |
+ # UTF-32BE with BOM |
|
3212 |
+ sniffed_xml_encoding = 'utf-32be' |
|
3213 |
+ xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') |
|
3214 |
+ elif xml_data[:4] == '\xff\xfe\x00\x00': |
|
3215 |
+ # UTF-32LE with BOM |
|
3216 |
+ sniffed_xml_encoding = 'utf-32le' |
|
3217 |
+ xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') |
|
3218 |
+ elif xml_data[:3] == '\xef\xbb\xbf': |
|
3219 |
+ # UTF-8 with BOM |
|
3220 |
+ sniffed_xml_encoding = 'utf-8' |
|
3221 |
+ xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') |
|
3222 |
+ else: |
|
3223 |
+ # ASCII-compatible |
|
3224 |
+ pass |
|
3225 |
+ xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) |
|
3226 |
+ except: |
|
3227 |
+ xml_encoding_match = None |
|
3228 |
+ if xml_encoding_match: |
|
3229 |
+ xml_encoding = xml_encoding_match.groups()[0].lower() |
|
3230 |
+ if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): |
|
3231 |
+ xml_encoding = sniffed_xml_encoding |
|
3232 |
+ acceptable_content_type = 0 |
|
3233 |
+ application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') |
|
3234 |
+ text_content_types = ('text/xml', 'text/xml-external-parsed-entity') |
|
3235 |
+ if (http_content_type in application_content_types) or \ |
|
3236 |
+ (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): |
|
3237 |
+ acceptable_content_type = 1 |
|
3238 |
+ true_encoding = http_encoding or xml_encoding or 'utf-8' |
|
3239 |
+ elif (http_content_type in text_content_types) or \ |
|
3240 |
+ (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): |
|
3241 |
+ acceptable_content_type = 1 |
|
3242 |
+ true_encoding = http_encoding or 'us-ascii' |
|
3243 |
+ elif http_content_type.startswith('text/'): |
|
3244 |
+ true_encoding = http_encoding or 'us-ascii' |
|
3245 |
+ elif http_headers and (not http_headers.has_key('content-type')): |
|
3246 |
+ true_encoding = xml_encoding or 'iso-8859-1' |
|
3247 |
+ else: |
|
3248 |
+ true_encoding = xml_encoding or 'utf-8' |
|
3249 |
+ # some feeds claim to be gb2312 but are actually gb18030. |
|
3250 |
+ # apparently MSIE and Firefox both do the following switch: |
|
3251 |
+ if true_encoding.lower() == 'gb2312': |
|
3252 |
+ true_encoding = 'gb18030' |
|
3253 |
+ return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type |
|
3254 |
+ |
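+# Illustrative sketch (not part of the original module): the RFC 3023 rules above
+# mean the same bytes can resolve to different encodings depending on the
+# Content-Type header. The headers below are hypothetical:
+#
+#     data = '<?xml version="1.0" encoding="iso-8859-2"?><feed/>'
+#     _getCharacterEncoding({'content-type': 'application/xml'}, data)[0]
+#     # -> 'iso-8859-2'  (encoding from the XML declaration is honored)
+#     _getCharacterEncoding({'content-type': 'text/xml'}, data)[0]
+#     # -> 'us-ascii'    (XML declaration ignored; HTTP default applies)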
|
3255 |
+def _toUTF8(data, encoding): |
|
3256 |
+ '''Changes an XML data stream on the fly to specify a new encoding |
|
3257 |
+ |
|
3258 |
+ data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already |
|
3259 |
+ encoding is a string recognized by encodings.aliases |
|
3260 |
+ ''' |
|
3261 |
+ if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) |
|
3262 |
+ # strip Byte Order Mark (if present) |
|
3263 |
+ if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): |
|
3264 |
+ if _debug: |
|
3265 |
+ sys.stderr.write('stripping BOM\n') |
|
3266 |
+ if encoding != 'utf-16be': |
|
3267 |
+ sys.stderr.write('trying utf-16be instead\n') |
|
3268 |
+ encoding = 'utf-16be' |
|
3269 |
+ data = data[2:] |
|
3270 |
+ elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): |
|
3271 |
+ if _debug: |
|
3272 |
+ sys.stderr.write('stripping BOM\n') |
|
3273 |
+ if encoding != 'utf-16le': |
|
3274 |
+ sys.stderr.write('trying utf-16le instead\n') |
|
3275 |
+ encoding = 'utf-16le' |
|
3276 |
+ data = data[2:] |
|
3277 |
+ elif data[:3] == '\xef\xbb\xbf': |
|
3278 |
+ if _debug: |
|
3279 |
+ sys.stderr.write('stripping BOM\n') |
|
3280 |
+ if encoding != 'utf-8': |
|
3281 |
+ sys.stderr.write('trying utf-8 instead\n') |
|
3282 |
+ encoding = 'utf-8' |
|
3283 |
+ data = data[3:] |
|
3284 |
+ elif data[:4] == '\x00\x00\xfe\xff': |
|
3285 |
+ if _debug: |
|
3286 |
+ sys.stderr.write('stripping BOM\n') |
|
3287 |
+ if encoding != 'utf-32be': |
|
3288 |
+ sys.stderr.write('trying utf-32be instead\n') |
|
3289 |
+ encoding = 'utf-32be' |
|
3290 |
+ data = data[4:] |
|
3291 |
+ elif data[:4] == '\xff\xfe\x00\x00': |
|
3292 |
+ if _debug: |
|
3293 |
+ sys.stderr.write('stripping BOM\n') |
|
3294 |
+ if encoding != 'utf-32le': |
|
3295 |
+ sys.stderr.write('trying utf-32le instead\n') |
|
3296 |
+ encoding = 'utf-32le' |
|
3297 |
+ data = data[4:] |
|
3298 |
+ newdata = unicode(data, encoding) |
|
3299 |
+ if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) |
|
3300 |
+ declmatch = re.compile('^<\?xml[^>]*?>') |
|
3301 |
+ newdecl = '''<?xml version='1.0' encoding='utf-8'?>''' |
|
3302 |
+ if declmatch.search(newdata): |
|
3303 |
+ newdata = declmatch.sub(newdecl, newdata) |
|
3304 |
+ else: |
|
3305 |
+ newdata = newdecl + u'\n' + newdata |
|
3306 |
+ return newdata.encode('utf-8') |
|
3307 |
+ |
|
3308 |
+def _stripDoctype(data): |
|
3309 |
+ '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) |
|
3310 |
+ |
|
3311 |
+ rss_version may be 'rss091n' or None |
|
3312 |
+ stripped_data is the same XML document, minus the DOCTYPE |
|
3313 |
+ ''' |
|
3314 |
+ start = re.search('<\w',data) |
|
3315 |
+ start = start and start.start() or -1 |
|
3316 |
+ head,data = data[:start+1], data[start+1:] |
|
3317 |
+ |
|
3318 |
+ entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) |
|
3319 |
+ entity_results=entity_pattern.findall(head) |
|
3320 |
+ head = entity_pattern.sub('', head) |
|
3321 |
+ doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) |
|
3322 |
+ doctype_results = doctype_pattern.findall(head) |
|
3323 |
+ doctype = doctype_results and doctype_results[0] or '' |
|
3324 |
+ if doctype.lower().count('netscape'): |
|
3325 |
+ version = 'rss091n' |
|
3326 |
+ else: |
|
3327 |
+ version = None |
|
3328 |
+ |
|
3329 |
+ # only allow 'safe' inline entity definitions
|
3330 |
+ replacement='' |
|
3331 |
+ if len(doctype_results)==1 and entity_results: |
|
3332 |
+ safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"') |
|
3333 |
+ safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) |
|
3334 |
+ if safe_entities: |
|
3335 |
+ replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities) |
|
3336 |
+ data = doctype_pattern.sub(replacement, head) + data |
|
3337 |
+ |
|
3338 |
+ return version, data, dict(replacement and safe_pattern.findall(replacement)) |
|
3339 |
+ |
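+# Illustrative sketch (not part of the original module): on a Netscape RSS 0.91
+# document, _stripDoctype reports the version and returns the document minus
+# its DOCTYPE, e.g.:
+#
+#     doc = '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" ' \
+#           '"http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss version="0.91"></rss>'
+#     version, data, entities = _stripDoctype(doc)
+#     # version == 'rss091n', data starts with '<rss', entities == {}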
|
3340 |
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): |
|
3341 |
+ '''Parse a feed from a URL, file, stream, or string''' |
|
3342 |
+ result = FeedParserDict() |
|
3343 |
+ result['feed'] = FeedParserDict() |
|
3344 |
+ result['entries'] = [] |
|
3345 |
+ if _XML_AVAILABLE: |
|
3346 |
+ result['bozo'] = 0 |
|
3347 |
+ if type(handlers) == types.InstanceType: |
|
3348 |
+ handlers = [handlers] |
|
3349 |
+ try: |
|
3350 |
+ f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) |
|
3351 |
+ data = f.read() |
|
3352 |
+ except Exception as e: |
|
3353 |
+ result['bozo'] = 1 |
|
3354 |
+ result['bozo_exception'] = e |
|
3355 |
+ data = '' |
|
3356 |
+ f = None |
|
3357 |
+ |
|
3358 |
+ # if feed is gzip-compressed, decompress it |
|
3359 |
+ if f and data and hasattr(f, 'headers'): |
|
3360 |
+ if gzip and f.headers.get('content-encoding', '') == 'gzip': |
|
3361 |
+ try: |
|
3362 |
+ data = gzip.GzipFile(fileobj=_StringIO(data)).read() |
|
3363 |
+ except Exception as e: |
|
3364 |
+ # Some feeds claim to be gzipped but they're not, so |
|
3365 |
+ # we get garbage. Ideally, we should re-request the |
|
3366 |
+ # feed without the 'Accept-encoding: gzip' header, |
|
3367 |
+ # but we don't. |
|
3368 |
+ result['bozo'] = 1 |
|
3369 |
+ result['bozo_exception'] = e |
|
3370 |
+ data = '' |
|
3371 |
+ elif zlib and f.headers.get('content-encoding', '') == 'deflate': |
|
3372 |
+ try: |
|
3373 |
+ data = zlib.decompress(data, -zlib.MAX_WBITS) |
|
3374 |
+ except Exception as e: |
|
3375 |
+ result['bozo'] = 1 |
|
3376 |
+ result['bozo_exception'] = e |
|
3377 |
+ data = '' |
|
3378 |
+ |
|
3379 |
+ # save HTTP headers |
|
3380 |
+ if hasattr(f, 'info'): |
|
3381 |
+ info = f.info() |
|
3382 |
+ etag = info.getheader('ETag') |
|
3383 |
+ if etag: |
|
3384 |
+ result['etag'] = etag |
|
3385 |
+ last_modified = info.getheader('Last-Modified') |
|
3386 |
+ if last_modified: |
|
3387 |
+ result['modified'] = _parse_date(last_modified) |
|
3388 |
+ if hasattr(f, 'url'): |
|
3389 |
+ result['href'] = f.url |
|
3390 |
+ result['status'] = 200 |
|
3391 |
+ if hasattr(f, 'status'): |
|
3392 |
+ result['status'] = f.status |
|
3393 |
+ if hasattr(f, 'headers'): |
|
3394 |
+ result['headers'] = f.headers.dict |
|
3395 |
+ if hasattr(f, 'close'): |
|
3396 |
+ f.close() |
|
3397 |
+ |
|
3398 |
+ # there are four encodings to keep track of: |
|
3399 |
+ # - http_encoding is the encoding declared in the Content-Type HTTP header |
|
3400 |
+ # - xml_encoding is the encoding declared in the <?xml declaration |
|
3401 |
+ # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data |
|
3402 |
+ # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications |
|
3403 |
+ http_headers = result.get('headers', {}) |
|
3404 |
+ result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ |
|
3405 |
+ _getCharacterEncoding(http_headers, data) |
|
3406 |
+ if http_headers and (not acceptable_content_type): |
|
3407 |
+ if http_headers.has_key('content-type'): |
|
3408 |
+ bozo_message = '%s is not an XML media type' % http_headers['content-type'] |
|
3409 |
+ else: |
|
3410 |
+ bozo_message = 'no Content-type specified' |
|
3411 |
+ result['bozo'] = 1 |
|
3412 |
+ result['bozo_exception'] = NonXMLContentType(bozo_message) |
|
3413 |
+ |
|
3414 |
+ result['version'], data, entities = _stripDoctype(data) |
|
3415 |
+ |
|
3416 |
+ baseuri = http_headers.get('content-location', result.get('href')) |
|
3417 |
+ baselang = http_headers.get('content-language', None) |
|
3418 |
+ |
|
3419 |
+ # if server sent 304, we're done |
|
3420 |
+ if result.get('status', 0) == 304: |
|
3421 |
+ result['version'] = '' |
|
3422 |
+ result['debug_message'] = 'The feed has not changed since you last checked, ' + \ |
|
3423 |
+ 'so the server sent no data. This is a feature, not a bug!' |
|
3424 |
+ return result |
|
3425 |
+ |
|
3426 |
+ # if there was a problem downloading, we're done |
|
3427 |
+ if not data: |
|
3428 |
+ return result |
|
3429 |
+ |
|
3430 |
+ # determine character encoding |
|
3431 |
+ use_strict_parser = 0 |
|
3432 |
+ known_encoding = 0 |
|
3433 |
+ tried_encodings = [] |
|
3434 |
+ # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM |
|
3435 |
+ for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): |
|
3436 |
+ if not proposed_encoding: continue |
|
3437 |
+ if proposed_encoding in tried_encodings: continue |
|
3438 |
+ tried_encodings.append(proposed_encoding) |
|
3439 |
+ try: |
|
3440 |
+ data = _toUTF8(data, proposed_encoding) |
|
3441 |
+ known_encoding = use_strict_parser = 1 |
|
3442 |
+ break |
|
3443 |
+ except: |
|
3444 |
+ pass |
|
3445 |
+ # if no luck and we have auto-detection library, try that |
|
3446 |
+ if (not known_encoding) and chardet: |
|
3447 |
+ try: |
|
3448 |
+ proposed_encoding = chardet.detect(data)['encoding'] |
|
3449 |
+ if proposed_encoding and (proposed_encoding not in tried_encodings): |
|
3450 |
+ tried_encodings.append(proposed_encoding) |
|
3451 |
+ data = _toUTF8(data, proposed_encoding) |
|
3452 |
+ known_encoding = use_strict_parser = 1 |
|
3453 |
+ except: |
|
3454 |
+ pass |
|
3455 |
+ # if still no luck and we haven't tried utf-8 yet, try that |
|
3456 |
+ if (not known_encoding) and ('utf-8' not in tried_encodings): |
|
3457 |
+ try: |
|
3458 |
+ proposed_encoding = 'utf-8' |
|
3459 |
+ tried_encodings.append(proposed_encoding) |
|
3460 |
+ data = _toUTF8(data, proposed_encoding) |
|
3461 |
+ known_encoding = use_strict_parser = 1 |
|
3462 |
+ except: |
|
3463 |
+ pass |
|
3464 |
+ # if still no luck and we haven't tried windows-1252 yet, try that |
|
3465 |
+ if (not known_encoding) and ('windows-1252' not in tried_encodings): |
|
3466 |
+ try: |
|
3467 |
+ proposed_encoding = 'windows-1252' |
|
3468 |
+ tried_encodings.append(proposed_encoding) |
|
3469 |
+ data = _toUTF8(data, proposed_encoding) |
|
3470 |
+ known_encoding = use_strict_parser = 1 |
|
3471 |
+ except: |
|
3472 |
+ pass |
|
3473 |
+ # if still no luck and we haven't tried iso-8859-2 yet, try that. |
|
3474 |
+ if (not known_encoding) and ('iso-8859-2' not in tried_encodings): |
|
3475 |
+ try: |
|
3476 |
+ proposed_encoding = 'iso-8859-2' |
|
3477 |
+ tried_encodings.append(proposed_encoding) |
|
3478 |
+ data = _toUTF8(data, proposed_encoding) |
|
3479 |
+ known_encoding = use_strict_parser = 1 |
|
3480 |
+ except: |
|
3481 |
+ pass |
|
3482 |
+ # if still no luck, give up |
|
3483 |
+ if not known_encoding: |
|
3484 |
+ result['bozo'] = 1 |
|
3485 |
+ result['bozo_exception'] = CharacterEncodingUnknown( \ |
|
3486 |
+ 'document encoding unknown, I tried ' + \ |
|
3487 |
+ '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \ |
|
3488 |
+ (result['encoding'], xml_encoding)) |
|
3489 |
+ result['encoding'] = '' |
|
3490 |
+ elif proposed_encoding != result['encoding']: |
|
3491 |
+ result['bozo'] = 1 |
|
3492 |
+ result['bozo_exception'] = CharacterEncodingOverride( \ |
|
3493 |
+ 'document declared as %s, but parsed as %s' % \
|
3494 |
+ (result['encoding'], proposed_encoding)) |
|
3495 |
+ result['encoding'] = proposed_encoding |
|
3496 |
+ |
|
3497 |
+ if not _XML_AVAILABLE: |
|
3498 |
+ use_strict_parser = 0 |
|
3499 |
+ if use_strict_parser: |
|
3500 |
+ # initialize the SAX parser |
|
3501 |
+ feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') |
|
3502 |
+ saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) |
|
3503 |
+ saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) |
|
3504 |
+ saxparser.setContentHandler(feedparser) |
|
3505 |
+ saxparser.setErrorHandler(feedparser) |
|
3506 |
+ source = xml.sax.xmlreader.InputSource() |
|
3507 |
+ source.setByteStream(_StringIO(data)) |
|
3508 |
+ if hasattr(saxparser, '_ns_stack'): |
|
3509 |
+ # work around bug in built-in SAX parser (doesn't recognize xml: namespace) |
|
3510 |
+ # PyXML doesn't have this problem, and it doesn't have _ns_stack either |
|
3511 |
+ saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'}) |
|
3512 |
+ try: |
|
3513 |
+ saxparser.parse(source) |
|
3514 |
+ except Exception as e:
|
3515 |
+ if _debug: |
|
3516 |
+ import traceback |
|
3517 |
+ traceback.print_stack() |
|
3518 |
+ traceback.print_exc() |
|
3519 |
+ sys.stderr.write('xml parsing failed\n') |
|
3520 |
+ result['bozo'] = 1 |
|
3521 |
+ result['bozo_exception'] = feedparser.exc or e |
|
3522 |
+ use_strict_parser = 0 |
|
3523 |
+ if not use_strict_parser: |
|
3524 |
+ feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities) |
|
3525 |
+ feedparser.feed(data) |
|
3526 |
+ result['feed'] = feedparser.feeddata |
|
3527 |
+ result['entries'] = feedparser.entries |
|
3528 |
+ result['version'] = result['version'] or feedparser.version |
|
3529 |
+ result['namespaces'] = feedparser.namespacesInUse |
|
3530 |
+ return result |
|
3531 |
+ |
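+# Illustrative sketch (not part of the original module): a typical conditional-GET
+# loop with parse(); the URL is hypothetical. Passing back the etag/modified values
+# from a previous result lets an unchanged feed come back with status 304 and no entries.
+#
+#     result = parse('http://example.com/feed.xml', agent='MyAggregator/1.0')
+#     if not result.bozo:
+#         for entry in result.entries:
+#             print entry.title
+#     later = parse('http://example.com/feed.xml',
+#                   etag=result.get('etag'), modified=result.get('modified'))
+#     # later.status == 304 when the feed has not changed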
|
3532 |
+class Serializer: |
|
3533 |
+ def __init__(self, results): |
|
3534 |
+ self.results = results |
|
3535 |
+ |
|
3536 |
+class TextSerializer(Serializer): |
|
3537 |
+ def write(self, stream=sys.stdout): |
|
3538 |
+ self._writer(stream, self.results, '') |
|
3539 |
+ |
|
3540 |
+ def _writer(self, stream, node, prefix): |
|
3541 |
+ if not node: return |
|
3542 |
+ if hasattr(node, 'keys'): |
|
3543 |
+ keys = node.keys() |
|
3544 |
+ keys.sort() |
|
3545 |
+ for k in keys: |
|
3546 |
+ if k in ('description', 'link'): continue |
|
3547 |
+ if node.has_key(k + '_detail'): continue |
|
3548 |
+ if node.has_key(k + '_parsed'): continue |
|
3549 |
+ self._writer(stream, node[k], prefix + k + '.') |
|
3550 |
+ elif type(node) == types.ListType: |
|
3551 |
+ index = 0 |
|
3552 |
+ for n in node: |
|
3553 |
+ self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].') |
|
3554 |
+ index += 1 |
|
3555 |
+ else: |
|
3556 |
+ try: |
|
3557 |
+ s = str(node).encode('utf-8') |
|
3558 |
+ s = s.replace('\\', '\\\\') |
|
3559 |
+ s = s.replace('\r', '') |
|
3560 |
+ s = s.replace('\n', r'\n') |
|
3561 |
+ stream.write(prefix[:-1]) |
|
3562 |
+ stream.write('=') |
|
3563 |
+ stream.write(s) |
|
3564 |
+ stream.write('\n') |
|
3565 |
+ except: |
|
3566 |
+ pass |
|
3567 |
+ |
|
3568 |
+class PprintSerializer(Serializer): |
|
3569 |
+ def write(self, stream=sys.stdout): |
|
3570 |
+ if self.results.has_key('href'): |
|
3571 |
+ stream.write(self.results['href'] + '\n\n') |
|
3572 |
+ from pprint import pprint |
|
3573 |
+ pprint(self.results, stream) |
|
3574 |
+ stream.write('\n') |
|
3575 |
+ |
|
3576 |
+if __name__ == '__main__': |
|
3577 |
+ try: |
|
3578 |
+ from optparse import OptionParser |
|
3579 |
+ except: |
|
3580 |
+ OptionParser = None |
|
3581 |
+ |
|
3582 |
+ if OptionParser: |
|
3583 |
+ optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-") |
|
3584 |
+ optionParser.set_defaults(format="pprint") |
|
3585 |
+ optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs") |
|
3586 |
+ optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs") |
|
3587 |
+ optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs") |
|
3588 |
+ optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") |
|
3589 |
+ optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)") |
|
3590 |
+ optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr") |
|
3591 |
+ (options, urls) = optionParser.parse_args() |
|
3592 |
+ if options.verbose: |
|
3593 |
+ _debug = 1 |
|
3594 |
+ if not urls: |
|
3595 |
+ optionParser.print_help() |
|
3596 |
+ sys.exit(0) |
|
3597 |
+ else: |
|
3598 |
+ if not sys.argv[1:]: |
|
3599 |
+ print __doc__ |
|
3600 |
+ sys.exit(0) |
|
3601 |
+ class _Options: |
|
3602 |
+ etag = modified = agent = referrer = None |
|
3603 |
+ format = 'pprint' |
|
3604 |
+ options = _Options() |
|
3605 |
+ urls = sys.argv[1:] |
|
3606 |
+ |
|
3607 |
+ zopeCompatibilityHack() |
|
3608 |
+ |
|
3609 |
+ serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer) |
|
3610 |
+ for url in urls: |
|
3611 |
+ results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer) |
|
3612 |
+ serializer(results).write(sys.stdout) |
... | ... |
@@ -0,0 +1,630 @@ |
1 |
+#!/usr/bin/python2.5
+# -*- coding: utf-8 -*-
|
2 |
+# chmod 755 me, and make sure I have UNIX style newlines. |
|
3 |
+# |
|
4 |
+# techcrunch.py |
|
5 |
+# |
|
6 |
+# http://feeds.feedburner.com/TechCrunch |
|
7 |
+# feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) |
|
8 |
+# feed.entries[14]['feedburner_origlink'], feed.entries[14]['slash_comments'] |
|
9 |
+# |
|
10 |
+# TODO: |
|
11 |
+# 1. Deep links: '<a class="post_more_link snap_nopreview" href="http://www.crunchgear.com/2010/08/21/fanboyism-when-expression-meets-desperation/">Read the rest of this entry »</a>' |
|
12 |
+# link = "http://techcrunch.com/2010/08/21/fanboyism-when-expression-meets-desperation/" |
|
13 |
+ |
|
14 |
+import feedparser |
|
15 |
+import yaml |
|
16 |
+import sys |
|
17 |
+import os |
|
18 |
+import time |
|
19 |
+import StringIO |
|
20 |
+import codecs |
|
21 |
+import traceback |
|
22 |
+import calendar |
|
23 |
+import pickle |
|
24 |
+import exceptions |
|
25 |
+import urllib |
|
26 |
+import urllib2 |
|
27 |
+import httplib |
|
28 |
+import shutil |
|
29 |
+import glob |
|
30 |
+import smtplib |
|
31 |
+import bisect |
|
32 |
+import analysis |
|
33 |
+import simplejson as json |
|
34 |
+import cookielib |
|
35 |
+ |
|
36 |
+debug = True |
|
37 |
+any_entry_added = False |
|
38 |
+ |
|
39 |
+localdir = '' |
|
40 |
+ |
|
41 |
+html_head = """ |
|
42 |
+<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'> |
|
43 |
+<HTML><HEAD> |
|
44 |
+ <title>TechCrunch Feed Filter</title> |
|
45 |
+ <!-- <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://techcrunch.dlma.com/rss_feed.xml" /> --> |
|
46 |
+ <link rel="alternate" type="application/rss+xml" title="RSS feed" href="http://feeds.feedburner.com/TrendingAtTechcrunch" /> |
|
47 |
+ <style type="text/css"> |
|
48 |
+    body { font-family: "Arial", sans-serif; }
|
49 |
+ .author { font-size: smaller; } |
|
50 |
+    h3 { font-size: larger; }
|
51 |
+ a { text-decoration: none; } |
|
52 |
+ /* table { border: none; border-collapse:collapse; font-size: large } */ |
|
53 |
+ table { border-collapse: collapse; } |
|
54 |
+    table.legend { border:1px solid LightSlateGray; font-size: medium; border-collapse:separate; }
|
55 |
+ table.legend th { border: 1px solid LightSlateGray; background-color: #E0E0E0; } |
|
56 |
+ table.legend td { border: 1px solid LightSlateGray; } |
|
57 |
+ tr.even { background:#%s; padding: 2em; } |
|
58 |
+ tr.odd { background:#%s; padding-bottom: 2em; } |
|
59 |
+ </style> |
|
60 |
+</HEAD> |
|
61 |
+<BODY> |
|
62 |
+<div align='center'><h3>TechCrunch Feed Filter</h3></div> |
|
63 |
+This page shows the analysis used to filter noise out of the TechCrunch feed into <a href="http://feeds.feedburner.com/TrendingAtTechcrunch">a more concise feed</a>.<br /><br />
|
64 |
+""" |
|
65 |
+ |
|
66 |
+html_footer = """ |
|
67 |
+</table> |
|
68 |
+</div><br /> |
|
69 |
+<div align='center'>Thanks to <a href="http://www.feedparser.org/">The Universal Feed Parser module</a>, |
|
70 |
+<a href="http://pyyaml.org/">PyYAML</a> and <a href="http://code.google.com/apis/chart/">Google Charts</a>.<br /><a href="techcrunch.yaml">raw data</a> • <a href="stats.txt">status</a></div><br /> |
|
71 |
+</BODY> |
|
72 |
+</HTML> |
|
73 |
+""" |
|
74 |
+ |
|
75 |
+img_width = 300 |
|
76 |
+img_height = 50 |
|
77 |
+ |
|
78 |
+series_1_color = "0000FF" |
|
79 |
+series_2_color = "00AA00" |
|
80 |
+threshold_color = "FF8C00" |
|
81 |
+ |
|
82 |
+even_background = "F8F8F8" |
|
83 |
+#even_background = "FFFFFF" |
|
84 |
+odd_background = "E8E8E8" |
|
85 |
+ |
|
86 |
+def asciiize( s ): |
|
87 |
+ try: |
|
88 |
+ return s.encode( 'ascii' ) |
|
89 |
+ except UnicodeEncodeError, e: |
|
90 |
+ return s |
|
91 |
+ except exceptions.AttributeError, e: |
|
92 |
+ return s |
|
93 |
+ |
|
94 |
+def sendEmail( subject, message, toaddrs, fromaddr='"techcrunch.py" <techcrunch@techcrunch.dlma.com>' ): |
|
95 |
+ """Sends Email""" |
|
96 |
+ smtp = smtplib.SMTP( 'localhost' ) |
|
97 |
+ smtp.sendmail( fromaddr, \ |
|
98 |
+ toaddrs, \ |
|
99 |
+                   "Content-Type: text/plain; charset=\"us-ascii\"\r\nFrom: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % \
|
100 |
+ ( fromaddr, ", ".join( toaddrs ), subject, message ) ) |
|
101 |
+ smtp.quit() |
|
102 |
+ |
|
103 |
+def make_chart_url( time_posted, comment_times, comment_values, retweet_times, retweet_values, met_threshold_pt, bg_color ): |
|
104 |
+# comment_times, comment_values = zip( *comments ) |
|
105 |
+# retweet_times, retweet_values = zip( *retweets ) |
|
106 |
+ |
|
107 |
+ # TODO handle failure cases, -1 |
|
108 |
+ |
|
109 |
+ if not len( comment_times ): |
|
110 |
+ comment_times = [ time_posted, ] |
|
111 |
+ if not len( comment_values ): |
|
112 |
+ comment_values = [ 0, ] |
|
113 |
+ if not len( retweet_times ): |
|
114 |
+ retweet_times = [ time_posted, ] |
|
115 |
+ if not len( retweet_values ): |
|
116 |
+ retweet_values = [ 0, ] |
|
117 |
+ |
|
118 |
+# comment_times = [ (i - time_posted + 900) / 1800 for i in comment_times ] |
|
119 |
+# retweet_times = [ (i - time_posted + 900) / 1800 for i in retweet_times ] |
|
120 |
+ comment_times = [ (i - time_posted) / 1800 for i in comment_times ] |
|
121 |
+ retweet_times = [ (i - time_posted) / 1800 for i in retweet_times ] |
|
122 |
+ |
|
123 |
+ min_comment_time = min( comment_times ) |
|
124 |
+ max_comment_time = max( comment_times ) |
|
125 |
+ min_comment_value = min( comment_values ) |
|
126 |
+ max_comment_value = max( comment_values ) |
|
127 |
+ min_retweet_time = min( retweet_times ) |
|
128 |
+ max_retweet_time = max( retweet_times ) |
|
129 |
+ min_retweet_value = min( retweet_values ) |
|
130 |
+ max_retweet_value = max( retweet_values ) |
|
131 |
+ |
|
132 |
+ if len( comment_values ) < 8 and len( comment_values ) > 1: |
|
133 |
+ # max_comment_value *= 2 |
|
134 |
+ pass |
|
135 |
+ elif len( comment_values ) == 1: |
|
136 |
+ min_comment_value = 0 |
|
137 |
+ if len( retweet_values ) < 8 and len( retweet_values ) > 1: |
|
138 |
+ # max_retweet_value *= 2 |
|
139 |
+ pass |
|
140 |
+ elif len( retweet_values ) == 1: |
|
141 |
+ min_retweet_value = 0 |
|
142 |
+ |
|
143 |
+ min_comment_value = 0 |
|
144 |
+ min_retweet_value = 0 |
|
145 |
+ |
|
146 |
+ chart_url = "http://chart.apis.google.com/chart?cht=lxy&chco=%s,%s&chs=%dx%d&chxs=0,%s|1,%s" % \ |
|
147 |
+ ( series_1_color, series_2_color, img_width, img_height, series_1_color, series_2_color ) |
|
148 |
+ chart_url += "&chd=t:%s|%s|%s|%s" % ( ','.join( [ str( n ) for n in comment_times ] ), |
|
149 |
+ ','.join( [ str( n ) for n in comment_values ] ), |
|
150 |
+ ','.join( [ str( n ) for n in retweet_times ] ), |
|
151 |
+ ','.join( [ str( n ) for n in retweet_values ] ) ) |
|
152 |
+ if met_threshold_pt != -1: |
|
153 |
+ chart_url += "&chm=o,%s,1,%d,10" % ( threshold_color, met_threshold_pt ) |
|
154 |
+ chart_url += "&chxt=y,r&chxl=0:|%d|%d|1:|%d|%d&chds=%d,%d,%d,%d,%d,%d,%d,%d" % \ |
|
155 |
+ ( min_comment_value, max_comment_value, min_retweet_value, max_retweet_value, |
|
156 |
+ 0, max( 7, max_comment_time ), |
|
157 |
+ min_comment_value, max_comment_value, |
|
158 |
+ 0, max( 7, max_retweet_time ), |
|
159 |
+                                 min_retweet_value, max_retweet_value )
|
160 |
+ chart_url += "&chf=bg,s,%s&chdl=comments|retweets" % ( bg_color, ) |
|
161 |
+ return chart_url |
|
162 |
+ |
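+# Illustrative use of make_chart_url above, with hypothetical values: an item
+# posted at t0, sampled twice for both series, with no qualifying point yet
+# (met_threshold_pt == -1):
+#   make_chart_url( t0, [t0, t0 + 1800], [0, 5], [t0, t0 + 1800], [0, 12], -1, even_background )
+# returns a chart.apis.google.com "lxy" URL plotting comments and retweets
+# against half-hour buckets since posting.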
|
163 |
+def process_feed( yaml_items ): |
|
164 |
+ """ |
|
165 |
+    Retrieve the TechCrunch feed and process its entries.
|
166 |
+    yaml_items (in, out): the list of tracked item dicts; new entries are inserted at the front.
|
167 |
+ """ |
|
168 |
+ |
|
169 |
+ feed = feedparser.parse( 'http://feeds.feedburner.com/TechCrunch' ) |
|
170 |
+ if hasattr( feed, 'status' ): |
|
171 |
+ if feed.status == 304: |
|
172 |
+ pass |
|
173 |
+ else: |
|
174 |
+ feed_is_modified = True |
|
175 |
+ if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302: |
|
176 |
+ if feed.status == 503: |
|
177 |
+ print "the feed is temporarily unavailable." |
|
178 |
+ elif feed.status == 400: |
|
179 |
+ print "the feed says we made a bad request." |
|
180 |
+ elif feed.status == 502: |
|
181 |
+ print "the feed reported a bad gateway error." |
|
182 |
+ elif feed.status == 404: |
|
183 |
+ print "the feed says the page was not found." |
|
184 |
+ elif feed.status == 500: |
|
185 |
+ print "the feed had an internal server error." |
|
186 |
+ elif feed.status == 403: |
|
187 |
+ print "Access to the feed was forbidden." |
|
188 |
+ else: |
|
189 |
+ print "the feed returned feed.status %d." % ( feed.status, ) |
|
190 |
+ else: |
|
191 |
+ # Save off this |
|
192 |
+ f = file( os.path.join( localdir, 'techcrunch_feed.pickle' ), 'wb' ) |
|
193 |
+ try: |
|
194 |
+ pickle.dump( feed, f ) |
|
195 |
+ except( pickle.PicklingError, exceptions.TypeError ), e: |
|
196 |
+ print "An error occurred while pickling the feed: %s." % \ |
|
197 |
+ ( # str(e.__class__), |
|
198 |
+ str(e) ) |
|
199 |
+ traceback.print_exc( file = sys.stdout ) |
|
200 |
+ feed_is_modified = False |
|
201 |
+ f.close() |
|
202 |
+ |
|
203 |
+ for i in reversed( feed.entries ): |
|
204 |
+ process_item( i, yaml_items ) |
|
205 |
+ |
|
206 |
+ # If we have more than 200 items, remove the old ones. |
|
207 |
+ while len( yaml_items ) > 200: |
|
208 |
+ yaml_items.pop() |
|
209 |
+ |
|
210 |
+ cookie = Get_cookie( urllib2.Request( 'http://mediacdn.disqus.com/1078/build/system/count.js' ) ) |
|
211 |
+ |
|
212 |
+ for i in yaml_items: |
|
213 |
+ # i['title'] = asciiize( i['title'] ) |
|
214 |
+ # i['tags'] = map( asciiize, i['tags'] ) |
|
215 |
+ process_yaml_item( i, cookie ) |
|
216 |
+ |
|
217 |
+ else: |
|
218 |
+ if hasattr(feed, 'bozo_exception'): |
|
219 |
+ e = feed.bozo_exception |
|
220 |
+ if isinstance( e, urllib2.URLError ): # e.__class__ == urllib2.URLError: # and hasattr(e, 'errno') and e.errno == 110: |
|
221 |
+ print_last_line = True |
|
222 |
+ if hasattr(e, 'reason'): |
|
223 |
+ if e.reason[0] == 110: |
|
224 |
+ print "the feed's connection timed out." |
|
225 |
+ print_last_line = False |
|
226 |
+ elif e.reason[0] == 111: |
|
227 |
+ print "the feed's connection was refused." |
|
228 |
+ print_last_line = False |
|
229 |
+ elif e.reason[0] == 104: |
|
230 |
+ print "the feed reset the connection." |
|
231 |
+ print_last_line = False |
|
232 |
+ else: |
|
233 |
+ print "the feed had a URLError with reason %s." % ( str(e.reason), ) |
|
234 |
+ print_last_line = False |
|
235 |
+ if print_last_line: |
|
236 |
+ print "the feed had a URLError %s" % ( str(e), ) |
|
237 |
+ elif isinstance( e, httplib.BadStatusLine ): |
|
238 |
+ if hasattr(e, 'message'): |
|
239 |
+ print "the feed gave a bad status line %s." % ( str(e.message ), ) |
|
240 |
+ else: |
|
241 |
+ print "the feed gave a bad status line." |
|
242 |
+ else: |
|
243 |
+ if len( str(e) ): |
|
244 |
+ print "the feed bozo_exception: %s \"%s\"" % ( str(e.__class__), str(e) ) |
|
245 |
+ else: |
|
246 |
+ print "the feed bozo_exception: %s %s" % ( str(e.__class__), repr(e) ) |
|
247 |
+ else: |
|
248 |
+ print "the feed returned class %s, %s" % ( str(feed.__class__), str(feed) ) |
|
249 |
+ |
|
250 |
+def process_item( feed_item, yaml_items ): |
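+    # Find (or create) the tracked yaml item matching this feed entry; new items
+    # are inserted at the front of yaml_items. If the entry is still fresh,
+    # append the current slash_comments sample.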
|
251 |
+ # Get the time |
|
252 |
+ global any_entry_added |
|
253 |
+ timecode_now = int( time.time() ) |
|
254 |
+ date_parsed = time.gmtime() |
|
255 |
+ if hasattr( feed_item, 'issued_parsed' ): |
|
256 |
+ date_parsed = feed_item.issued_parsed |
|
257 |
+ date_set = True |
|
258 |
+ elif hasattr( feed_item, 'date_parsed' ): |
|
259 |
+ date_parsed = feed_item.date_parsed |
|
260 |
+ date_set = True |
|
261 |
+ else: |
|
262 |
+ print "process_item found no timestamp for", asciiize( feed_item.link ) |
|
263 |
+ timecode_parsed = calendar.timegm( date_parsed ) |
|
264 |
+ |
|
265 |
+ # Look for i.feedburner_origlink in yaml_items |
|
266 |
+ yaml_item = None |
|
267 |
+ for i in yaml_items: |
|
268 |
+ if feed_item.feedburner_origlink == i['link']: |
|
269 |
+ yaml_item = i |
|
270 |
+ break |
|
271 |
+ if not yaml_item: |
|
272 |
+ author = '' |
|
273 |
+ link = feed_item.link |
|
274 |
+ if hasattr( feed_item, 'author' ): |
|
275 |
+ author = asciiize( feed_item.author ) |
|
276 |
+ if hasattr( feed_item, 'feedburner_origlink' ): |
|
277 |
+ link = feed_item.feedburner_origlink |
|
278 |
+ |
|
279 |
+ # Make a new yaml_item |
|
280 |
+ yaml_item = { 'title' : asciiize( feed_item.title ), |
|
281 |
+ 'link' : asciiize( link ), |
|
282 |
+ 'author' : author, |
|
283 |
+ 'tags' : [], |
|
284 |
+ 'orig_posted' : timecode_parsed, |
|
285 |
+ 'qualified' : -1, |
|
286 |
+ 'comment_times' : [], |
|
287 |
+ 'comments' : [], |
|
288 |
+ 'slash_comment_times' : [], |
|
289 |
+ 'slash_comments' : [], |
|
290 |
+ 'retweet_times' : [], |
|
291 |
+ 'retweets' : [] |
|
292 |
+ } |
|
293 |
+ if hasattr( feed_item, 'tags' ): |
|
294 |
+ for i in feed_item.tags: |
|
295 |
+ yaml_item['tags'].append( asciiize( i.term ) ) |
|
296 |
+ |
|
297 |
+ yaml_items.insert( 0, yaml_item ) |
|
298 |
+ any_entry_added = True |
|
299 |
+ |
|
300 |
+ # Maybe check to ensure that this item isn't too old. |
|
301 |
+ if timecode_parsed < timecode_now - 60 * 30 * 9: |
|
302 |
+ return |
|
303 |
+ |
|
304 |
+ # Now, add the new values |
|
305 |
+ if hasattr( feed_item, 'slash_comments' ) and len( yaml_item['slash_comments'] ) < 8: |
|
306 |
+ any_entry_added = True |
|
307 |
+ yaml_item['slash_comment_times'].append( timecode_now ) |
|
308 |
+ yaml_item['slash_comments'].append( int( feed_item.slash_comments ) ) |
|
309 |
+ |
|
310 |
+def process_yaml_item( yaml_item, cookie ): |
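+    # Poll Disqus and TweetMeme for this item until eight samples of each have
+    # been collected; each successful poll appends a (time, count) pair.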
|
311 |
+ global any_entry_added |
|
312 |
+ |
|
313 |
+ timecode_now = int( time.time() ) |
|
314 |
+ if len( yaml_item['comments'] ) < 8: |
|
315 |
+ num_comments = Get_num_disqus_comments( yaml_item['link'], cookie ) |
|
316 |
+ if num_comments != -1: |
|
317 |
+ any_entry_added = True |
|
318 |
+ yaml_item['comment_times'].append( timecode_now ) |
|
319 |
+ yaml_item['comments'].append( num_comments ) |
|
320 |
+ |
|
321 |
+ if len( yaml_item['retweets'] ) < 8: |
|
322 |
+ num_retweets = Get_num_retweets( yaml_item['link'] ) |
|
323 |
+ if num_retweets != -1: |
|
324 |
+ any_entry_added = True |
|
325 |
+ yaml_item['retweet_times'].append( timecode_now ) |
|
326 |
+ yaml_item['retweets'].append( num_retweets ) |
|
327 |
+ |
|
328 |
+def Get_num_comments( url_string ): |
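+    # Scrape the article page itself for the comment count. This appears to be
+    # a legacy path; process_yaml_item uses the Disqus variant below instead.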
|
329 |
+ try: |
|
330 |
+ f = urllib2.urlopen( url_string ) |
|
331 |
+ data = f.read() |
|
332 |
+ f.close() |
|
333 |
+ except urllib2.URLError, e: |
|
334 |
+ if hasattr( e, 'reason' ): |
|
335 |
+ print "Get_num_comments got an error:", e.reason |
|
336 |
+ elif hasattr( e, 'code' ): |
|
337 |
+ print "Get_num_comments got an error. Code:", e.code |
|
338 |
+ return -1 |
|
339 |
+ tag_to_find = '<a href="#comments" rel="nofollow">' |
|
340 |
+ offset = data.find( tag_to_find ) |
|
341 |
+ if offset != -1: |
|
342 |
+ start_pos = offset + len( tag_to_find ) |
|
343 |
+ end_pos = start_pos |
|
344 |
+ while str.isdigit( data[ end_pos ] ): |
|
345 |
+ end_pos += 1 |
|
346 |
+ if end_pos > start_pos: |
|
347 |
+ return int( data[start_pos:end_pos] ) |
|
348 |
+ return -1 |
|
349 |
+ |
|
350 |
+def Get_cookie( cookie_request ): |
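+    # Fetch the request once just to harvest any cookies the server sets; the
+    # returned jar is reused by Get_num_disqus_comments for the count requests.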
|
351 |
+ cookie = cookielib.CookieJar() |
|
352 |
+ try: |
|
353 |
+ cookie_response = urllib2.urlopen( cookie_request ) |
|
354 |
+ cookie.extract_cookies( cookie_response, cookie_request ) |
|
355 |
+ return cookie |
|
356 |
+ except urllib2.URLError, e: |
|
357 |
+ if hasattr( e, 'reason' ): |
|
358 |
+ print "Get_cookie got an error:", e.reason |
|
359 |
+ elif hasattr( e, 'code' ): |
|
360 |
+ print "Get_cookie got an error. Code:", e.code |
|
361 |
+ return None |
|
362 |
+ |
|
363 |
+def Get_num_disqus_comments( url_string, cookie ): |
|
364 |
+ |
|
365 |
+ if cookie == None: |
|
366 |
+ return -1 |
|
367 |
+ |
|
368 |
+ try: |
|
369 |
+ f = urllib2.urlopen( url_string ) |
|
370 |
+ data = f.read() |
|
371 |
+ f.close() |
|
372 |
+ except urllib2.URLError, e: |
|
373 |
+ if hasattr( e, 'reason' ): |
|
374 |
+ print "Get_num_disqus_comments got an error:", e.reason |
|
375 |
+ elif hasattr( e, 'code' ): |
|
376 |
+ print "Get_num_disqus_comments got an error. Code:", e.code |
|
377 |
+ return -1 |
|
378 |
+ |
|
379 |
+ tag_to_find = '<a href="#comments" rel="nofollow"><span class="dsq-postid" rel="' |
|
380 |
+ disqus_tag_to_find = 'displayCount(' |
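+    # The Disqus count.js response is expected to look roughly like
+    #   displayCount({"counts": [{"comments": 42, ...}]})
+    # (shape inferred from the parsing below); the JSON argument is sliced out
+    # and counts[0]["comments"] is returned.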
|
381 |
+ offset = data.find( tag_to_find ) |
|
382 |
+ if offset != -1: |
|
383 |
+ start_pos = offset + len( tag_to_find ) |
|
384 |
+ end_pos = start_pos |
|
385 |
+ while data[ end_pos ] != '"' and end_pos < start_pos + 200: |
|
386 |
+ end_pos += 1 |
|
387 |
+ if end_pos < start_pos + 200: |
|
388 |
+ opener = urllib2.build_opener( urllib2.HTTPCookieProcessor( cookie ) ) |
|
389 |
+ url_GET_data = urllib.quote_plus( data[start_pos:end_pos] ).replace( '+', '%20' ) |
|
390 |
+ request = urllib2.Request( 'http://disqus.com/forums/techcrunch/count.js?q=1&0=1,' + url_GET_data ) |
|
391 |
+ try: |
|
392 |
+ response = opener.open( request ) |
|
393 |
+ disqus_data = response.read() |
|
394 |
+ except urllib2.URLError, e: |
|
395 |
+ if hasattr( e, 'reason' ): |
|
396 |
+ print "Get_num_disqus_comments got an error getting the count:", e.reason |
|
397 |
+ elif hasattr( e, 'code' ): |
|
398 |
+ print "Get_num_disqus_comments got an error getting the count. Code:", e.code |
|
399 |
+ disqus_data = "" |
|
400 |
+ disqus_offset = disqus_data.find( disqus_tag_to_find ) |
|
401 |
+ if disqus_offset != -1: |
|
402 |
+ start_pos = disqus_offset + len( disqus_tag_to_find ) |
|
403 |
+ end_pos = disqus_data.find( '}]})', start_pos ) |
|
404 |
+ if end_pos != -1: |
|
405 |
+ return int( json.loads( disqus_data[start_pos:end_pos+3] )['counts'][0]['comments'] ) |
|
406 |
+ |
|
407 |
+ return -1 |
|
408 |
+ |
|
409 |
+def Get_num_retweets( url_string ): |
|
410 |
+ try: |
|
411 |
+ f = urllib2.urlopen( 'http://api.tweetmeme.com/button.js?url=%s' % ( url_string ) ) |
|
412 |
+ data = f.read() |
|
413 |
+ f.close() |
|
414 |
+ except urllib2.URLError, e: |
|
415 |
+ if hasattr( e, 'reason' ): |
|
416 |
+ print "Get_num_retweets got an error:", e.reason |
|
417 |
+ elif hasattr( e, 'code' ): |
|
418 |
+ print "Get_num_retweets got an error. Code:", e.code |
|
419 |
+ return -1 |
|
420 |
+ tag_to_find = '<span class="c">' |
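+    # button.js is expected to embed the count as '<span class="c">57</span>'
+    # (markup shape inferred from the search below); the digits between the
+    # tags are returned as the retweet count.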
|
421 |
+ offset = data.find( tag_to_find ) |
|
422 |
+ if offset != -1: |
|
423 |
+ start_pos = offset + len( tag_to_find ) |
|
424 |
+ end_pos = data.find( '<', start_pos ) |
|
425 |
+ if end_pos != -1: |
|
426 |
+ return int( data[ start_pos:end_pos ] ) |
|
427 |
+ return -1 |
|
428 |
+ |
|
429 |
+def Save_image( url_string, file_path ): |
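+    # Download the chart image to file_path and return its 'cache/' relative
+    # path; on a fetch error or an implausibly small response, fall back to
+    # returning the remote URL unchanged.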
|
430 |
+ try: |
|
431 |
+ f = urllib2.urlopen( url_string ) |
|
432 |
+ data = f.read() |
|
433 |
+ f.close() |
|
434 |
+ except urllib2.URLError, e: |
|
435 |
+ if hasattr( e, 'reason' ): |
|
436 |
+ print "Save_image got an error:", e.reason |
|
437 |
+ elif hasattr( e, 'code' ): |
|
438 |
+ print "Save_image got an error. Code:", e.code |
|
439 |
+ return url_string |
|
440 |
+ if len( data ) > 50: |
|
441 |
+ f = open( file_path, 'wb' ) |
|
442 |
+ f.write( data ) |
|
443 |
+ f.close() |
|
444 |
+ return 'cache/' + os.path.basename( file_path ) |
|
445 |
+ return url_string |
|
446 |
+ |
|
447 |
+def Make_index_html( yaml_items, stats ): |
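+    # Regenerate index.html: a legend row of per-bucket stats, then one row per
+    # recent item with a star for qualified items and its cached chart image.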
|
448 |
+ cur_time = int( time.time() ) |
|
449 |
+ new_index_fullpath = os.path.join( localdir, 'index.html_new' ) |
|
450 |
+ index_fullpath = os.path.join( localdir, 'index.html' ) |
|
451 |
+ cache_path = os.path.join( localdir, 'cache' ) |
|
452 |
+ |
|
453 |
+    files_to_delete = glob.glob( os.path.join( cache_path, '*.png' ) )
|
454 |
+# shutil.rmtree( cache_path ) |
|
455 |
+# os.mkdir( cache_path ) |
|
456 |
+ |
|
457 |
+ f = file( new_index_fullpath, 'w' ) |
|
458 |
+ f.write( html_head % ( even_background, odd_background ) ) |
|
459 |
+# f.write( '<div align="center">\n<table cellpadding="4">' ) |
|
460 |
+ |
|
461 |
+ f.write( '<div align="center">\n<table class="legend">\n<tr><th>0:30</th><th>1:00</th><th>1:30</th><th>2:00</th><th>2:30</th><th>3:00</th><th>3:30</th><th>4:00</th></tr><tr>' ) |
|
462 |
+ for median, mean, std_dev in stats: |
|
463 |
+ f.write( '<td>med=%1.1f μ=%1.1f σ=%1.1f </td> ' % ( median, mean, std_dev ) ) |
|
464 |
+ f.write( '</tr>\n</table></div>\n<br />\n' ) |
|
465 |
+ |
|
466 |
+ f.write( '<div align="center">\n<table>\n' ) |
|
467 |
+ image_index = 0 |
|
468 |
+ for i in yaml_items[:40]: |
|
469 |
+ chart_url = make_chart_url( i['orig_posted'], |
|
470 |
+ i['comment_times'], |
|
471 |
+ i['comments'], |
|
472 |
+ i['retweet_times'], |
|
473 |
+ i['retweets'], |
|
474 |
+ i['qualified'], |
|
475 |
+ image_index % 2 and even_background or odd_background, |
|
476 |
+ ) |
|
477 |
+ image_url = Save_image( chart_url, os.path.join( cache_path, '%d_%d.png' % ( cur_time, image_index ) ) ) |
|
478 |
+        f.write( '<tr valign="middle" class="%s">\n    <td><strong><a href="%s">%s</a></strong> <span class="author">by %s</span></td>\n' % \
|
479 |
+ ( image_index % 2 and "even" or "odd", |
|
480 |
+ i['link'], |
|
481 |
+ i['title'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
482 |
+ i['author'].encode( 'ascii', 'xmlcharrefreplace' ), |
|
483 |
+ ) |
|
484 |
+ ) |
|
485 |
+        f.write( '    <td>%s</td>\n' % ( i['qualified'] != -1 and '<img src="star_30.png" width="30" height="29" />' or '' ) )
|
486 |
+ f.write( ' <td><img src="%s" width="%d" height="%d" border="0" /></td></tr>\n' % \ |
|
487 |
+ ( image_url, |
|
488 |
+ img_width, |
|
489 |
+ img_height |
|
490 |
+ ) |
|
491 |
+ ) |
|
492 |
+ image_index += 1 |
|
493 |
+ f.write( html_footer ) |
|
494 |
+ f.close() |
|
495 |
+ if os.path.exists( index_fullpath ): |
|
496 |
+ os.unlink( index_fullpath ) |
|
497 |
+ shutil.move( new_index_fullpath, index_fullpath ) |
|
498 |
+ for fname in files_to_delete: |
|
499 |
+ os.unlink( fname ) |
|
500 |
+ |
|
501 |
+def Make_feed_file( yaml_items ): |
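+    # Write rss_feed.xml with only the qualified items, newest first, capped at
+    # 15 entries.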
|
502 |
+ f = open( os.path.join( localdir, 'rss_feed.xml' ), 'wb' ) |
|
503 |
+ f.write( "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n<rss version=\"2.0\">\n<channel>\n<title>Trending at TechCrunch</title><link>http://techcrunch.dlma.com</link>" ) |
|
504 |
+ f.write( "<pubDate>%s</pubDate><description>Automatically Generated Feed</description><language>en-us</language>" % ( time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime() ) ) ) |
|
505 |
+ count = 0 |
|
506 |
+ for item in yaml_items: |
|
507 |
+ now = time.strftime( "%a, %d %b %Y %H:%M:%S +0000", time.gmtime( item['orig_posted'] ) ) |
|
508 |
+ if item['qualified'] != -1: |
|
509 |
+ f.write( "<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \ |
|
510 |
+ ( item['title'].encode( 'ascii', 'xmlcharrefreplace' ), now, item['link'], item['link'], item['author'].encode( 'ascii', 'xmlcharrefreplace' ) ) ) |
|
511 |
+ count += 1 |
|
512 |
+ if count > 14: |
|
513 |
+ break |
|
514 |
+ f.write( "</channel></rss>" ) |
|
515 |
+ f.close() |
|
516 |
+ |
|
517 |
+if __name__=='__main__': |
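+    # Overall flow: capture stdout/stderr, load techcrunch.yaml, poll the feed,
+    # qualify items against the retweet threshold, rewrite the yaml/RSS/HTML
+    # outputs, then append a status line to stats.txt.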
|
518 |
+ start_time = time.time() |
|
519 |
+ progress_text = [] |
|
520 |
+ |
|
521 |
+ old_stdout = sys.stdout |
|
522 |
+ old_stderr = sys.stderr |
|
523 |
+ sys.stdout = sys.stderr = StringIO.StringIO() |
|
524 |
+ |
|
525 |
+ try: |
|
526 |
+ localdir = os.path.abspath( os.path.dirname( sys.argv[0] ) ) |
|
527 |
+ # |
|
528 |
+ # Read in techcrunch.yaml |
|
529 |
+ # |
|
530 |
+ # [ { 'title' : 'Title Text', |
|
531 |
+ # 'link' : u'http://techcrunch.com/2010/08/17/google-buzz-who-to-follow/', |
|
532 |
+ # 'author' : u'MG Siegler', |
|
533 |
+ # 'orig_posted' : 1282197199 |
|
534 |
+ # 'tags' : [ u'Google', u'privacy' ] |
|
535 |
+ # 'qualified' : -1 |
|
536 |
+ # 'comment_times' : [ 1282197199, 1282197407 ] |
|
537 |
+ # 'comments' : [ 0, 15 ] |
|
538 |
+ # 'slash_comment_times' : [ 1282197199, 1282197407 ] |
|
539 |
+ # 'slash_comments' : [ 0, 5 ] |
|
542 |
+ # 'retweet_times' : [ 1282197199, 1282197407 ] |
|
543 |
+ # 'retweets' : [ 0, 43 ] |
|
544 |
+ # }, |
|
545 |
+ # { ... } |
|
546 |
+ # ] |
|
547 |
+ # |
|
548 |
+ yaml_fullpath = os.path.join( localdir, 'techcrunch.yaml' ) |
|
549 |
+ if os.path.exists( yaml_fullpath ): |
|
550 |
+ f = file( yaml_fullpath, 'rb' ) |
|
551 |
+ items = yaml.load( f ) |
|
552 |
+ f.close() |
|
553 |
+ else: |
|
554 |
+ print "could not open", yaml_fullpath |
|
555 |
+ items = [] |
|
556 |
+ |
|
557 |
+ progress_text = [ "read techcrunch.yaml" ] |
|
558 |
+ process_feed( items ) |
|
559 |
+ |
|
560 |
+ # |
|
561 |
+ # If any work was done, then write files. |
|
562 |
+ # |
|
563 |
+ if True or any_entry_added: |
|
564 |
+ |
|
565 |
+ stats = analysis.Process_retweets_for_feed( items ) |
|
566 |
+ |
|
567 |
+ # We'll only look at the stats for the time 1:00 to 1:30 after posting. |
|
568 |
+ median, mean, sigma = stats[2] |
|
569 |
+ threshold = median + sigma |
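+            # Worked example (numbers illustrative only): if the 1:30 bucket has
+            # median 20 and sigma 10, an item qualifies once a retweet sample of
+            # 30 or more is seen within its first 90 minutes.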
|
570 |
+ for item in items: |
|
571 |
+ if item['qualified'] == -1: |
|
572 |
+ for i in range( len( item['retweet_times'] ) ): |
|
573 |
+ r_time = item['retweet_times'][i] |
|
574 |
+ if r_time - item['orig_posted'] < 5400: |
|
575 |
+ if item['retweets'][i] >= threshold: |
|
576 |
+ item['qualified'] = i |
|
577 |
+ if r_time - item['orig_posted'] >= 3600: |
|
578 |
+ break |
|
579 |
+ |
|
580 |
+ # |
|
581 |
+ # Write out the updated yaml file. |
|
582 |
+ # |
|
583 |
+ f = file( yaml_fullpath, 'wb' ) |
|
584 |
+ yaml.dump( items, f, width=120 ) |
|
585 |
+ f.close() |
|
586 |
+ f = file( os.path.join( localdir, 'techcrunch_text.yaml' ), 'w' ) |
|
587 |
+ yaml.dump( items, f, width=120 ) |
|
588 |
+ f.close() |
|
589 |
+ f = codecs.open( os.path.join( localdir, 'techcrunch_unicode.yaml' ), 'w', 'utf-8' ) |
|
590 |
+ yaml.dump( items, f, encoding='utf-8', width=120 ) |
|
591 |
+ f.close() |
|
592 |
+ |
|
593 |
+ Make_feed_file( items ) |
|
594 |
+ |
|
595 |
+ Make_index_html( items, stats ) |
|
596 |
+ else: |
|
597 |
+ print "No entries were added this time." |
|
598 |
+ |
|
599 |
+ except Exception, e: |
|
600 |
+ exceptional_text = "An exception occurred: " + str( e.__class__ ) + " " + str(e) |
|
601 |
+ print exceptional_text, ' '.join( progress_text ) |
|
602 |
+ traceback.print_exc( file = sys.stdout ) |
|
603 |
+ try: |
|
604 |
+ sendEmail( 'Exception thrown in techcrunch.py', |
|
605 |
+ exceptional_text, |
|
606 |
+ ( 'david.blume@gmail.com', ) ) |
|
607 |
+ except Exception, e: |
|
608 |
+ print "Could not send email to notify you of the exception. :(" |
|
609 |
+ |
|
610 |
+ message = sys.stdout.getvalue() |
|
611 |
+ sys.stdout = old_stdout |
|
612 |
+ sys.stderr = old_stderr |
|
613 |
+ if not debug: |
|
614 |
+ print message |
|
615 |
+ |
|
616 |
+ # Finally, let's save this to a statistics page |
|
617 |
+ if os.path.exists( os.path.join( localdir, 'stats.txt' ) ): |
|
618 |
+ f = open( os.path.join( localdir, 'stats.txt' )) |
|
619 |
+ try: |
|
620 |
+ lines = f.readlines() |
|
621 |
+ finally: |
|
622 |
+ f.close() |
|
623 |
+ else: |
|
624 |
+ lines = [] |
|
625 |
+ lines = lines[:168] # Just keep the past week's worth |
|
626 |
+ status = len( message.strip() ) and message.strip().replace( '\n', ' - ' ) or "OK" |
|
627 |
+ lines.insert( 0, "%s %3.0fs %s\n" % ( time.strftime('%H:%M, %Y-%m-%d', time.localtime()), time.time() - start_time, status )) |
|
628 |
+ f = open( os.path.join( localdir,'stats.txt' ), 'w' ) |
|
629 |
+ f.writelines( lines ) |
|
630 |
+ f.close() |
|