dblume committed on 2024-07-25 21:43:27
Showing 3 changed files, with 59 additions and 3674 deletions.
@@ -1,11 +1,10 @@
-#!/usr/bin/env python
+#!/home/dblume/opt/python-3.9.6/bin/python3
 
 import yaml
 import sys
 import os
 import time
 import traceback
-import exceptions
 import math
 import bisect
 
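Note: dropping `import exceptions` is part of the Python 3 move, not mere cleanup; that module only ever existed in Python 2. A minimal check, for illustration only (not part of the commit):

    import builtins

    # Python 3 exposes every built-in exception directly (they also live in
    # the 'builtins' module), so Python 2's separate 'exceptions' module is gone.
    assert ValueError is builtins.ValueError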
@@ -15,7 +14,7 @@ debug = True
 def get_standard_deviation(l):
     """ returns the standard deviation of the iterable l """
     mean = sum(l) / len(l)
-    squares_of_diffs = map(lambda x: pow(x - mean, 2), l)
+    squares_of_diffs = [pow(x - mean, 2) for x in l]
     mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
     return math.sqrt(mean_of_squares)
 
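Note: each `map(...)` to list-comprehension change in this commit is a correctness fix, not a style choice: on Python 3, `map()` returns a lazy iterator, so the `len(squares_of_diffs)` on the next line would raise. A small sketch of the failure mode:

    data = [1.0, 2.0, 4.0]
    mean = sum(data) / len(data)

    lazy = map(lambda x: pow(x - mean, 2), data)  # Python 3: a lazy map object
    # len(lazy) raises TypeError: object of type 'map' has no len()
    eager = [pow(x - mean, 2) for x in data]      # a list supports len() and re-iteration
    assert len(eager) == len(data)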
@@ -48,7 +47,7 @@ def process_comments_for_feed(yaml_items):
     stats = []
     for time_block in time_blocks:
         mean = sum(time_block) / len(time_block)
-        squares_of_diffs = map(lambda x: pow(x - mean, 2), time_block)
+        squares_of_diffs = [pow(x - mean, 2) for x in time_block]
         mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
         std_dev = math.sqrt(mean_of_squares)
         stats.append((mean, std_dev))
@@ -84,7 +83,7 @@ def calculate_median_mean_stddev(time_blocks):
         # Calculate the mean and standard deviation
         if count > 0:
             mean = sum(block) / float(len(block))
-            squares_of_diffs = map(lambda x: pow(x - mean, 2), block)
+            squares_of_diffs = [pow(x - mean, 2) for x in block]
             mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
         else:
             mean = 0
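Note: all three rewritten blocks compute a population standard deviation by hand. On the 3.9 interpreter the new shebang targets, the standard library agrees; a quick equivalence sketch (illustration only):

    import math
    import statistics

    block = [1.0, 2.0, 4.0]
    mean = sum(block) / len(block)
    mean_of_squares = sum(pow(x - mean, 2) for x in block) / len(block)
    assert math.isclose(math.sqrt(mean_of_squares), statistics.pstdev(block))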
@@ -153,7 +152,7 @@ if __name__=='__main__':
             items = yaml.load(f)
             f.close()
         else:
-            print "could not open", yaml_fullpath
+            print("could not open", yaml_fullpath)
             items = []
 
         weekend_stats, weekday_stats = Process_feed(items, 'fb_shares', 'comment_times')
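Note: the unchanged `items = yaml.load(f)` context line may deserve a follow-up commit: depending on the PyYAML installed for the new interpreter, `yaml.load()` without an explicit `Loader` is deprecated (PyYAML 5.1+) or a hard error (PyYAML 6). A hedged sketch of the usual fix, with a hypothetical path standing in for `yaml_fullpath`:

    import yaml

    with open('items.yaml') as f:   # 'items.yaml' is a stand-in for yaml_fullpath
        items = yaml.safe_load(f)   # safe_load selects SafeLoader explicitly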
@@ -163,24 +162,24 @@ if __name__=='__main__':
         weekend_threshold = weekend_median + (weekend_sigma)
         median, mean, sigma = weekday_stats[2]
         threshold = median + (sigma)
-        print "Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold)
+        print("Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold))
-        print "Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold)
+        print("Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold))
         for item in items:
             if item['qualified'] == -1:
-                print "Processing", item['title'].encode('ascii', 'replace')
+                print("Processing", item['title'].encode('ascii', 'replace'))
                 for i in range(len(item['retweet_times'])):
                     r_time = item['retweet_times'][i]
                     if r_time - item['orig_posted'] < 5400:
-                        print "Time %1.1f = %d" % ((r_time - item['orig_posted']) / 1800.0, item['retweets'][i]),
+                        print("Time %1.1f = %d" % ((r_time - item['orig_posted']) / 1800.0, item['retweets'][i]), end=' ')
                         if item['retweets'][i] >= threshold:
                             item['qualified'] = i
-                            print "NOW QUALIFIES",
+                            print("NOW QUALIFIES", end=' ')
                     if r_time - item['orig_posted'] >= 3600:
                         break
-
+                print()
 
     except Exception as e:
         exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
-        print exceptional_text, ' '.join(progress_text)
+        print(exceptional_text, ' '.join(progress_text))
         traceback.print_exc(file=sys.stdout)
 
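Note: the trailing commas were the subtlest prints to port: in Python 2, `print x,` suppressed the newline, and `print(x, end=' ')` reproduces that, with the new bare `print()` terminating the line after the loop. (One leftover wart: `item['title'].encode('ascii', 'replace')` returns bytes on Python 3, so that print now shows a `b'...'` repr.) A tiny sketch:

    for i in range(3):
        print("Time %1.1f" % (i / 2.0), end=' ')  # stays on one line, like Python 2's trailing comma
    print()  # emits the newline, mirroring the print() added after the loop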
@@ -1,3612 +0,0 @@
-#!/usr/bin/env python
-"""Universal feed parser
-
-Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
-
-Visit http://feedparser.org/ for the latest version
-Visit http://feedparser.org/docs/ for the latest documentation
-
-Required: Python 2.1 or later
-Recommended: Python 2.3 or later
-Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
-"""
-
-__version__ = "4.2-pre-" + "$Revision: 291 $"[11:14] + "-svn"
-__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice,
-  this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE."""
-__author__ = "Mark Pilgrim <http://diveintomark.org/>"
-__contributors__ = ["Jason Diamond <http://injektilo.org/>",
-                    "John Beimler <http://john.beimler.org/>",
-                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
-                    "Aaron Swartz <http://aaronsw.com/>",
-                    "Kevin Marks <http://epeus.blogspot.com/>",
-                    "Sam Ruby <http://intertwingly.net/>"]
-_debug = 0
-
-# HTTP "User-Agent" header to send to servers when downloading feeds.
-# If you are embedding feedparser in a larger application, you should
-# change this to your application name and URL.
-USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
-
-# HTTP "Accept" header to send to servers when downloading feeds. If you don't
-# want to send an Accept header, set this to None.
-ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
-
-# List of preferred XML parsers, by SAX driver name. These will be tried first,
-# but if they're not installed, Python will keep searching through its own list
-# of pre-installed parsers until it finds one that supports everything we need.
-PREFERRED_XML_PARSERS = ["drv_libxml2"]
-
-# If you want feedparser to automatically run HTML markup through HTML Tidy, set
-# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
-# or utidylib <http://utidylib.berlios.de/>.
-TIDY_MARKUP = 0
-
-# List of Python interfaces for HTML Tidy, in order of preference. Only useful
-# if TIDY_MARKUP = 1
-PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
-
-# If you want feedparser to automatically resolve all relative URIs, set this
-# to 1.
-RESOLVE_RELATIVE_URIS = 1
-
-# If you want feedparser to automatically sanitize all potentially unsafe
-# HTML content, set this to 1.
-SANITIZE_HTML = 1
-
-# ---------- required modules (should come with any Python distribution) ----------
-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
-try:
-    from cStringIO import StringIO as _StringIO
-except:
-    from StringIO import StringIO as _StringIO
-
-# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
-
-# gzip is included with most Python distributions, but may not be available if you compiled your own
-try:
-    import gzip
-except:
-    gzip = None
-try:
-    import zlib
-except:
-    zlib = None
-
-# If a real XML parser is available, feedparser will attempt to use it. feedparser has
-# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
-# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
-# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
-try:
-    import xml.sax
-    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
-    from xml.sax.saxutils import escape as _xmlescape
-    _XML_AVAILABLE = 1
-except:
-    _XML_AVAILABLE = 0
-    def _xmlescape(data,entities={}):
-        data = data.replace('&', '&amp;')
-        data = data.replace('>', '&gt;')
-        data = data.replace('<', '&lt;')
-        for char, entity in entities:
-            data = data.replace(char, entity)
-        return data
-
-# base64 support for Atom feeds that contain embedded binary data
-try:
-    import base64, binascii
-except:
-    base64 = binascii = None
-
-# cjkcodecs and iconv_codec provide support for more character encodings.
-# Both are available from http://cjkpython.i18n.org/
-try:
-    import cjkcodecs.aliases
-except:
-    pass
-try:
-    import iconv_codec
-except:
-    pass
-
-# chardet library auto-detects character encodings
-# Download from http://chardet.feedparser.org/
-try:
-    import chardet
-    if _debug:
-        import chardet.constants
-        chardet.constants._debug = 1
-except:
-    chardet = None
-
-# reversable htmlentitydefs mappings for Python 2.2
-try:
-    from htmlentitydefs import name2codepoint, codepoint2name
-except:
-    import htmlentitydefs
-    name2codepoint={}
-    codepoint2name={}
-    for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
-        if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
-        name2codepoint[name]=ord(codepoint)
-        codepoint2name[ord(codepoint)]=name
-
-# BeautifulSoup parser used for parsing microformats from embedded HTML content
-# http://www.crummy.com/software/BeautifulSoup/
-# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
-# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
-# patch and modify the compatibility statement accordingly.
-try:
-    import BeautifulSoup
-except:
-    BeautifulSoup = None
-
-# ---------- don't touch these ----------
-class ThingsNobodyCaresAboutButMe(Exception): pass
-class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
-class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
-class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
-class UndeclaredNamespace(Exception): pass
-
-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
-sgmllib.special = re.compile('<!')
-sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
-
-if sgmllib.endbracket.search(' <').start(0):
-    class EndBracketMatch:
-        endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
-        def search(self,string,index=0):
-            self.match = self.endbracket.match(string,index)
-            if self.match: return self
-        def start(self,n):
-            return self.match.end(n)
-    sgmllib.endbracket = EndBracketMatch()
-
-SUPPORTED_VERSIONS = {'': 'unknown',
-                      'rss090': 'RSS 0.90',
-                      'rss091n': 'RSS 0.91 (Netscape)',
-                      'rss091u': 'RSS 0.91 (Userland)',
-                      'rss092': 'RSS 0.92',
-                      'rss093': 'RSS 0.93',
-                      'rss094': 'RSS 0.94',
-                      'rss20': 'RSS 2.0',
-                      'rss10': 'RSS 1.0',
-                      'rss': 'RSS (unknown version)',
-                      'atom01': 'Atom 0.1',
-                      'atom02': 'Atom 0.2',
-                      'atom03': 'Atom 0.3',
-                      'atom10': 'Atom 1.0',
-                      'atom': 'Atom (unknown version)',
-                      'cdf': 'CDF',
-                      'hotrss': 'Hot RSS'
-                      }
-
-try:
-    UserDict = dict
-except NameError:
-    # Python 2.1 does not have dict
-    from UserDict import UserDict
-    def dict(aList):
-        rc = {}
-        for k, v in aList:
-            rc[k] = v
-        return rc
-
-class FeedParserDict(UserDict):
-    keymap = {'channel': 'feed',
-              'items': 'entries',
-              'guid': 'id',
-              'date': 'updated',
-              'date_parsed': 'updated_parsed',
-              'description': ['subtitle', 'summary'],
-              'url': ['href'],
-              'modified': 'updated',
-              'modified_parsed': 'updated_parsed',
-              'issued': 'published',
-              'issued_parsed': 'published_parsed',
-              'copyright': 'rights',
-              'copyright_detail': 'rights_detail',
-              'tagline': 'subtitle',
-              'tagline_detail': 'subtitle_detail'}
-    def __getitem__(self, key):
-        if key == 'category':
-            return UserDict.__getitem__(self, 'tags')[0]['term']
-        if key == 'enclosures':
-            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
-            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
-        if key == 'license':
-            for link in UserDict.__getitem__(self, 'links'):
-                if link['rel']=='license' and link.has_key('href'):
-                    return link['href']
-        if key == 'categories':
-            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
-        realkey = self.keymap.get(key, key)
-        if type(realkey) == types.ListType:
-            for k in realkey:
-                if UserDict.has_key(self, k):
-                    return UserDict.__getitem__(self, k)
-        if UserDict.has_key(self, key):
-            return UserDict.__getitem__(self, key)
-        return UserDict.__getitem__(self, realkey)
-
-    def __setitem__(self, key, value):
-        for k in self.keymap.keys():
-            if key == k:
-                key = self.keymap[k]
-                if type(key) == types.ListType:
-                    key = key[0]
-        return UserDict.__setitem__(self, key, value)
-
-    def get(self, key, default=None):
-        if self.has_key(key):
-            return self[key]
-        else:
-            return default
-
-    def setdefault(self, key, value):
-        if not self.has_key(key):
-            self[key] = value
-        return self[key]
-
-    def has_key(self, key):
-        try:
-            return hasattr(self, key) or UserDict.has_key(self, key)
-        except AttributeError:
-            return False
-
-    def __getattr__(self, key):
-        try:
-            return self.__dict__[key]
-        except KeyError:
-            pass
-        try:
-            assert not key.startswith('_')
-            return self.__getitem__(key)
-        except:
-            raise AttributeError, "object has no attribute '%s'" % key
-
-    def __setattr__(self, key, value):
-        if key.startswith('_') or key == 'data':
-            self.__dict__[key] = value
-        else:
-            return self.__setitem__(key, value)
-
-    def __contains__(self, key):
-        return self.has_key(key)
-
-def zopeCompatibilityHack():
-    global FeedParserDict
-    del FeedParserDict
-    def FeedParserDict(aDict=None):
-        rc = {}
-        if aDict:
-            rc.update(aDict)
-        return rc
-
-_ebcdic_to_ascii_map = None
-def _ebcdic_to_ascii(s):
-    global _ebcdic_to_ascii_map
-    if not _ebcdic_to_ascii_map:
-        emap = (
-            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
-            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
-            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
-            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
-            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
-            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
-            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
-            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
-            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
-            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
-            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
-            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
-            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
-            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
-            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
-            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
-            )
-        import string
-        _ebcdic_to_ascii_map = string.maketrans( \
-            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
-    return s.translate(_ebcdic_to_ascii_map)
-
-_cp1252 = {
-  unichr(128): unichr(8364), # euro sign
-  unichr(130): unichr(8218), # single low-9 quotation mark
-  unichr(131): unichr( 402), # latin small letter f with hook
-  unichr(132): unichr(8222), # double low-9 quotation mark
-  unichr(133): unichr(8230), # horizontal ellipsis
-  unichr(134): unichr(8224), # dagger
-  unichr(135): unichr(8225), # double dagger
-  unichr(136): unichr( 710), # modifier letter circumflex accent
-  unichr(137): unichr(8240), # per mille sign
-  unichr(138): unichr( 352), # latin capital letter s with caron
-  unichr(139): unichr(8249), # single left-pointing angle quotation mark
-  unichr(140): unichr( 338), # latin capital ligature oe
-  unichr(142): unichr( 381), # latin capital letter z with caron
-  unichr(145): unichr(8216), # left single quotation mark
-  unichr(146): unichr(8217), # right single quotation mark
-  unichr(147): unichr(8220), # left double quotation mark
-  unichr(148): unichr(8221), # right double quotation mark
-  unichr(149): unichr(8226), # bullet
-  unichr(150): unichr(8211), # en dash
-  unichr(151): unichr(8212), # em dash
-  unichr(152): unichr( 732), # small tilde
-  unichr(153): unichr(8482), # trade mark sign
-  unichr(154): unichr( 353), # latin small letter s with caron
-  unichr(155): unichr(8250), # single right-pointing angle quotation mark
-  unichr(156): unichr( 339), # latin small ligature oe
-  unichr(158): unichr( 382), # latin small letter z with caron
-  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
-
-_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
-def _urljoin(base, uri):
-    uri = _urifixer.sub(r'\1\3', uri)
-    try:
-        return urlparse.urljoin(base, uri)
-    except:
-        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
-        return urlparse.urljoin(base, uri)
-
-class _FeedParserMixin:
-    namespaces = {'': '',
-                  'http://backend.userland.com/rss': '',
-                  'http://blogs.law.harvard.edu/tech/rss': '',
-                  'http://purl.org/rss/1.0/': '',
-                  'http://my.netscape.com/rdf/simple/0.9/': '',
-                  'http://example.com/newformat#': '',
-                  'http://example.com/necho': '',
-                  'http://purl.org/echo/': '',
-                  'uri/of/echo/namespace#': '',
-                  'http://purl.org/pie/': '',
-                  'http://purl.org/atom/ns#': '',
-                  'http://www.w3.org/2005/Atom': '',
-                  'http://purl.org/rss/1.0/modules/rss091#': '',
-
-                  'http://webns.net/mvcb/': 'admin',
-                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
-                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
-                  'http://media.tangent.org/rss/1.0/': 'audio',
-                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
-                  'http://web.resource.org/cc/': 'cc',
-                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
-                  'http://purl.org/rss/1.0/modules/company': 'co',
-                  'http://purl.org/rss/1.0/modules/content/': 'content',
-                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
-                  'http://purl.org/dc/elements/1.1/': 'dc',
-                  'http://purl.org/dc/terms/': 'dcterms',
-                  'http://purl.org/rss/1.0/modules/email/': 'email',
-                  'http://purl.org/rss/1.0/modules/event/': 'ev',
-                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
-                  'http://freshmeat.net/rss/fm/': 'fm',
-                  'http://xmlns.com/foaf/0.1/': 'foaf',
-                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
-                  'http://postneo.com/icbm/': 'icbm',
-                  'http://purl.org/rss/1.0/modules/image/': 'image',
-                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
-                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
-                  'http://purl.org/rss/1.0/modules/link/': 'l',
-                  'http://search.yahoo.com/mrss': 'media',
-                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
-                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
-                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
-                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
-                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
-                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
-                  'http://purl.org/rss/1.0/modules/search/': 'search',
-                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
-                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
-                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
-                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
-                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
-                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
-                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
-                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
-                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
-                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
-                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
-                  'http://wellformedweb.org/commentAPI/': 'wfw',
-                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
-                  'http://www.w3.org/1999/xhtml': 'xhtml',
-                  'http://www.w3.org/1999/xlink': 'xlink',
-                  'http://www.w3.org/XML/1998/namespace': 'xml'
-}
-    _matchnamespaces = {}
-
-    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
-    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
-    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
-    html_types = ['text/html', 'application/xhtml+xml']
-
-    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
-        if _debug: sys.stderr.write('initializing FeedParser\n')
-        if not self._matchnamespaces:
-            for k, v in self.namespaces.items():
-                self._matchnamespaces[k.lower()] = v
-        self.feeddata = FeedParserDict() # feed-level data
-        self.encoding = encoding # character encoding
-        self.entries = [] # list of entry-level data
-        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
-        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
-
-        # the following are used internally to track state;
-        # this is really out of control and should be refactored
-        self.infeed = 0
-        self.inentry = 0
-        self.incontent = 0
-        self.intextinput = 0
-        self.inimage = 0
-        self.inauthor = 0
-        self.incontributor = 0
-        self.inpublisher = 0
-        self.insource = 0
-        self.sourcedata = FeedParserDict()
-        self.contentparams = FeedParserDict()
-        self._summaryKey = None
-        self.namespacemap = {}
-        self.elementstack = []
-        self.basestack = []
-        self.langstack = []
-        self.baseuri = baseuri or ''
-        self.lang = baselang or None
-        self.svgOK = 0
-        self.hasTitle = 0
-        if baselang:
-            self.feeddata['language'] = baselang.replace('_','-')
-
-    def unknown_starttag(self, tag, attrs):
-        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
-        # normalize attrs
-        attrs = [(k.lower(), v) for k, v in attrs]
-        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
-
-        # track xml:base and xml:lang
-        attrsD = dict(attrs)
-        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
-        if type(baseuri) != type(u''):
-            try:
-                baseuri = unicode(baseuri, self.encoding)
-            except:
-                baseuri = unicode(baseuri, 'iso-8859-1')
-        self.baseuri = _urljoin(self.baseuri, baseuri)
-        lang = attrsD.get('xml:lang', attrsD.get('lang'))
-        if lang == '':
-            # xml:lang could be explicitly set to '', we need to capture that
-            lang = None
-        elif lang is None:
-            # if no xml:lang is specified, use parent lang
-            lang = self.lang
-        if lang:
-            if tag in ('feed', 'rss', 'rdf:RDF'):
-                self.feeddata['language'] = lang.replace('_','-')
-        self.lang = lang
-        self.basestack.append(self.baseuri)
-        self.langstack.append(lang)
-
-        # track namespaces
-        for prefix, uri in attrs:
-            if prefix.startswith('xmlns:'):
-                self.trackNamespace(prefix[6:], uri)
-            elif prefix == 'xmlns':
-                self.trackNamespace(None, uri)
-
-        # track inline content
-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
-            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
-            # element declared itself as escaped markup, but it isn't really
-            self.contentparams['type'] = 'application/xhtml+xml'
-        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
-            if tag.find(':') <> -1:
-                prefix, tag = tag.split(':', 1)
-                namespace = self.namespacesInUse.get(prefix, '')
-                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
-                    attrs.append(('xmlns',namespace))
-                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
-                    attrs.append(('xmlns',namespace))
-            if tag == 'svg': self.svgOK += 1
-            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
-
-        # match namespaces
-        if tag.find(':') <> -1:
-            prefix, suffix = tag.split(':', 1)
-        else:
-            prefix, suffix = '', tag
-        prefix = self.namespacemap.get(prefix, prefix)
-        if prefix:
-            prefix = prefix + '_'
-
-        # special hack for better tracking of empty textinput/image elements in illformed feeds
-        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
-            self.intextinput = 0
-        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
-            self.inimage = 0
-
-        # call special handler (if defined) or default handler
-        methodname = '_start_' + prefix + suffix
-        try:
-            method = getattr(self, methodname)
-            return method(attrsD)
-        except AttributeError:
-            return self.push(prefix + suffix, 1)
-
-    def unknown_endtag(self, tag):
-        if _debug: sys.stderr.write('end %s\n' % tag)
-        # match namespaces
-        if tag.find(':') <> -1:
-            prefix, suffix = tag.split(':', 1)
-        else:
-            prefix, suffix = '', tag
-        prefix = self.namespacemap.get(prefix, prefix)
-        if prefix:
-            prefix = prefix + '_'
-        if suffix == 'svg' and self.svgOK: self.svgOK -= 1
-
-        # call special handler (if defined) or default handler
-        methodname = '_end_' + prefix + suffix
-        try:
-            if self.svgOK: raise AttributeError()
-            method = getattr(self, methodname)
-            method()
-        except AttributeError:
-            self.pop(prefix + suffix)
-
-        # track inline content
-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
-            # element declared itself as escaped markup, but it isn't really
-            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
-            self.contentparams['type'] = 'application/xhtml+xml'
-        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
-            tag = tag.split(':')[-1]
-            self.handle_data('</%s>' % tag, escape=0)
-
-        # track xml:base and xml:lang going out of scope
-        if self.basestack:
-            self.basestack.pop()
-            if self.basestack and self.basestack[-1]:
-                self.baseuri = self.basestack[-1]
-        if self.langstack:
-            self.langstack.pop()
-            if self.langstack: # and (self.langstack[-1] is not None):
-                self.lang = self.langstack[-1]
-
-    def handle_charref(self, ref):
-        # called for each character reference, e.g. for '&#160;', ref will be '160'
-        if not self.elementstack: return
-        ref = ref.lower()
-        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
-            text = '&#%s;' % ref
-        else:
-            if ref[0] == 'x':
-                c = int(ref[1:], 16)
-            else:
-                c = int(ref)
-            text = unichr(c).encode('utf-8')
-        self.elementstack[-1][2].append(text)
-
-    def handle_entityref(self, ref):
-        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
-        if not self.elementstack: return
-        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
-        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
-            text = '&%s;' % ref
-        elif ref in self.entities.keys():
-            text = self.entities[ref]
-            if text.startswith('&#') and text.endswith(';'):
-                return self.handle_entityref(text)
-        else:
-            try: name2codepoint[ref]
-            except KeyError: text = '&%s;' % ref
-            else: text = unichr(name2codepoint[ref]).encode('utf-8')
-        self.elementstack[-1][2].append(text)
-
-    def handle_data(self, text, escape=1):
-        # called for each block of plain text, i.e. outside of any tag and
-        # not containing any character or entity references
-        if not self.elementstack: return
-        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
-            text = _xmlescape(text)
-        self.elementstack[-1][2].append(text)
-
-    def handle_comment(self, text):
-        # called for each comment, e.g. <!-- insert message here -->
-        pass
-
-    def handle_pi(self, text):
-        # called for each processing instruction, e.g. <?instruction>
-        pass
-
-    def handle_decl(self, text):
-        pass
-
-    def parse_declaration(self, i):
-        # override internal declaration handler to handle CDATA blocks
-        if _debug: sys.stderr.write('entering parse_declaration\n')
-        if self.rawdata[i:i+9] == '<![CDATA[':
-            k = self.rawdata.find(']]>', i)
-            if k == -1: k = len(self.rawdata)
-            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
-            return k+3
-        else:
-            k = self.rawdata.find('>', i)
-            return k+1
-
-    def mapContentType(self, contentType):
-        contentType = contentType.lower()
-        if contentType == 'text':
-            contentType = 'text/plain'
-        elif contentType == 'html':
-            contentType = 'text/html'
-        elif contentType == 'xhtml':
-            contentType = 'application/xhtml+xml'
-        return contentType
-
-    def trackNamespace(self, prefix, uri):
-        loweruri = uri.lower()
-        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
-            self.version = 'rss090'
-        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
-            self.version = 'rss10'
-        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
-            self.version = 'atom10'
-        if loweruri.find('backend.userland.com/rss') <> -1:
-            # match any backend.userland.com namespace
-            uri = 'http://backend.userland.com/rss'
-            loweruri = uri
-        if self._matchnamespaces.has_key(loweruri):
-            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
-            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
-        else:
-            self.namespacesInUse[prefix or ''] = uri
-
-    def resolveURI(self, uri):
-        return _urljoin(self.baseuri or '', uri)
-
-    def decodeEntities(self, element, data):
-        return data
-
-    def strattrs(self, attrs):
-        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
-
-    def push(self, element, expectingText):
-        self.elementstack.append([element, expectingText, []])
-
-    def pop(self, element, stripWhitespace=1):
-        if not self.elementstack: return
-        if self.elementstack[-1][0] != element: return
-
-        element, expectingText, pieces = self.elementstack.pop()
-
-        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
-            # remove enclosing child element, but only if it is a <div> and
-            # only if all the remaining content is nested underneath it.
-            # This means that the divs would be retained in the following:
-            #    <div>foo</div><div>bar</div>
-            while pieces and len(pieces)>1 and not pieces[-1].strip():
-                del pieces[-1]
-            while pieces and len(pieces)>1 and not pieces[0].strip():
-                del pieces[0]
-            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
-                depth = 0
-                for piece in pieces[:-1]:
-                    if piece.startswith('</'):
-                        depth -= 1
-                        if depth == 0: break
-                    elif piece.startswith('<') and not piece.endswith('/>'):
-                        depth += 1
-                else:
-                    pieces = pieces[1:-1]
-
-        output = ''.join(pieces)
-        if stripWhitespace:
-            output = output.strip()
-        if not expectingText: return output
-
-        # decode base64 content
-        if base64 and self.contentparams.get('base64', 0):
-            try:
-                output = base64.decodestring(output)
-            except binascii.Error:
-                pass
-            except binascii.Incomplete:
-                pass
-
-        # resolve relative URIs
-        if (element in self.can_be_relative_uri) and output:
-            output = self.resolveURI(output)
-
-        # decode entities within embedded markup
-        if not self.contentparams.get('base64', 0):
-            output = self.decodeEntities(element, output)
-
-        if self.lookslikehtml(output):
-            self.contentparams['type']='text/html'
-
-        # remove temporary cruft from contentparams
-        try:
-            del self.contentparams['mode']
-        except KeyError:
-            pass
-        try:
-            del self.contentparams['base64']
-        except KeyError:
-            pass
-
-        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
-        # resolve relative URIs within embedded markup
-        if is_htmlish and RESOLVE_RELATIVE_URIS:
-            if element in self.can_contain_relative_uris:
-                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
-
-        # parse microformats
-        # (must do this before sanitizing because some microformats
-        # rely on elements that we sanitize)
-        if is_htmlish and element in ['content', 'description', 'summary']:
-            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
-            if mfresults:
-                for tag in mfresults.get('tags', []):
-                    self._addTag(tag['term'], tag['scheme'], tag['label'])
-                for enclosure in mfresults.get('enclosures', []):
-                    self._start_enclosure(enclosure)
-                for xfn in mfresults.get('xfn', []):
-                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
-                vcard = mfresults.get('vcard')
-                if vcard:
-                    self._getContext()['vcard'] = vcard
-
-        # sanitize embedded markup
-        if is_htmlish and SANITIZE_HTML:
-            if element in self.can_contain_dangerous_markup:
-                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
-
-        if self.encoding and type(output) != type(u''):
-            try:
-                output = unicode(output, self.encoding)
-            except:
-                pass
-
-        # address common error where people take data that is already
-        # utf-8, presume that it is iso-8859-1, and re-encode it.
-        if self.encoding=='utf-8' and type(output) == type(u''):
-            try:
-                output = unicode(output.encode('iso-8859-1'), 'utf-8')
-            except:
-                pass
-
-        # map win-1252 extensions to the proper code points
-        if type(output) == type(u''):
-            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
-
-        # categories/tags/keywords/whatever are handled in _end_category
-        if element == 'category':
-            return output
-
-        if element == 'title' and self.hasTitle:
-            return output
-
-        # store output in appropriate place(s)
-        if self.inentry and not self.insource:
-            if element == 'content':
-                self.entries[-1].setdefault(element, [])
-                contentparams = copy.deepcopy(self.contentparams)
-                contentparams['value'] = output
-                self.entries[-1][element].append(contentparams)
-            elif element == 'link':
-                self.entries[-1][element] = output
-                if output:
-                    self.entries[-1]['links'][-1]['href'] = output
-            else:
-                if element == 'description':
-                    element = 'summary'
-                self.entries[-1][element] = output
-                if self.incontent:
-                    contentparams = copy.deepcopy(self.contentparams)
-                    contentparams['value'] = output
-                    self.entries[-1][element + '_detail'] = contentparams
-        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
-            context = self._getContext()
-            if element == 'description':
-                element = 'subtitle'
-            context[element] = output
-            if element == 'link':
-                context['links'][-1]['href'] = output
-            elif self.incontent:
-                contentparams = copy.deepcopy(self.contentparams)
-                contentparams['value'] = output
-                context[element + '_detail'] = contentparams
-        return output
-
-    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
-        self.incontent += 1
-        if self.lang: self.lang=self.lang.replace('_','-')
-        self.contentparams = FeedParserDict({
-            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
-            'language': self.lang,
-            'base': self.baseuri})
-        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
-        self.push(tag, expectingText)
-
-    def popContent(self, tag):
-        value = self.pop(tag)
-        self.incontent -= 1
-        self.contentparams.clear()
-        return value
-
-    # a number of elements in a number of RSS variants are nominally plain
-    # text, but this is routinely ignored. This is an attempt to detect
-    # the most common cases. As false positives often result in silent
-    # data loss, this function errs on the conservative side.
-    def lookslikehtml(self, str):
-        if self.version.startswith('atom'): return
-        if self.contentparams.get('type','text/html') != 'text/plain': return
-
-        # must have a close tag or a entity reference to qualify
-        if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return
-
-        # all tags must be in a restricted subset of valid HTML tags
-        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
-            re.findall(r'</?(\w+)',str)): return
-
-        # all entities must have been defined as valid HTML entities
-        from htmlentitydefs import entitydefs
-        if filter(lambda e: e not in entitydefs.keys(),
-            re.findall(r'&(\w+);',str)): return
-
-        return 1
-
-    def _mapToStandardPrefix(self, name):
-        colonpos = name.find(':')
-        if colonpos <> -1:
-            prefix = name[:colonpos]
-            suffix = name[colonpos+1:]
-            prefix = self.namespacemap.get(prefix, prefix)
-            name = prefix + ':' + suffix
-        return name
-
-    def _getAttribute(self, attrsD, name):
-        return attrsD.get(self._mapToStandardPrefix(name))
-
-    def _isBase64(self, attrsD, contentparams):
-        if attrsD.get('mode', '') == 'base64':
-            return 1
-        if self.contentparams['type'].startswith('text/'):
-            return 0
-        if self.contentparams['type'].endswith('+xml'):
-            return 0
-        if self.contentparams['type'].endswith('/xml'):
-            return 0
-        return 1
-
-    def _itsAnHrefDamnIt(self, attrsD):
-        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
-        if href:
-            try:
-                del attrsD['url']
-            except KeyError:
-                pass
-            try:
-                del attrsD['uri']
-            except KeyError:
-                pass
-            attrsD['href'] = href
-        return attrsD
-
-    def _save(self, key, value):
-        context = self._getContext()
-        context.setdefault(key, value)
-
-    def _start_rss(self, attrsD):
-        versionmap = {'0.91': 'rss091u',
-                      '0.92': 'rss092',
-                      '0.93': 'rss093',
-                      '0.94': 'rss094'}
-        if not self.version:
-            attr_version = attrsD.get('version', '')
-            version = versionmap.get(attr_version)
-            if version:
-                self.version = version
-            elif attr_version.startswith('2.'):
-                self.version = 'rss20'
-            else:
-                self.version = 'rss'
-
-    def _start_dlhottitles(self, attrsD):
-        self.version = 'hotrss'
-
-    def _start_channel(self, attrsD):
-        self.infeed = 1
-        self._cdf_common(attrsD)
-    _start_feedinfo = _start_channel
-
-    def _cdf_common(self, attrsD):
-        if attrsD.has_key('lastmod'):
-            self._start_modified({})
-            self.elementstack[-1][-1] = attrsD['lastmod']
-            self._end_modified()
-        if attrsD.has_key('href'):
-            self._start_link({})
-            self.elementstack[-1][-1] = attrsD['href']
-            self._end_link()
-
-    def _start_feed(self, attrsD):
-        self.infeed = 1
-        versionmap = {'0.1': 'atom01',
-                      '0.2': 'atom02',
-                      '0.3': 'atom03'}
-        if not self.version:
-            attr_version = attrsD.get('version')
-            version = versionmap.get(attr_version)
-            if version:
-                self.version = version
-            else:
-                self.version = 'atom'
-
-    def _end_channel(self):
-        self.infeed = 0
-    _end_feed = _end_channel
-
-    def _start_image(self, attrsD):
-        context = self._getContext()
-        context.setdefault('image', FeedParserDict())
-        self.inimage = 1
-        self.hasTitle = 0
-        self.push('image', 0)
-
-    def _end_image(self):
-        self.pop('image')
-        self.inimage = 0
-
-    def _start_textinput(self, attrsD):
-        context = self._getContext()
-        context.setdefault('textinput', FeedParserDict())
-        self.intextinput = 1
-        self.hasTitle = 0
-        self.push('textinput', 0)
-    _start_textInput = _start_textinput
-
-    def _end_textinput(self):
-        self.pop('textinput')
-        self.intextinput = 0
-    _end_textInput = _end_textinput
-
-    def _start_author(self, attrsD):
-        self.inauthor = 1
-        self.push('author', 1)
-    _start_managingeditor = _start_author
-    _start_dc_author = _start_author
-    _start_dc_creator = _start_author
-    _start_itunes_author = _start_author
-
-    def _end_author(self):
-        self.pop('author')
-        self.inauthor = 0
-        self._sync_author_detail()
-    _end_managingeditor = _end_author
-    _end_dc_author = _end_author
-    _end_dc_creator = _end_author
-    _end_itunes_author = _end_author
-
-    def _start_itunes_owner(self, attrsD):
-        self.inpublisher = 1
-        self.push('publisher', 0)
-
-    def _end_itunes_owner(self):
-        self.pop('publisher')
-        self.inpublisher = 0
-        self._sync_author_detail('publisher')
-
-    def _start_contributor(self, attrsD):
-        self.incontributor = 1
-        context = self._getContext()
-        context.setdefault('contributors', [])
-        context['contributors'].append(FeedParserDict())
-        self.push('contributor', 0)
-
-    def _end_contributor(self):
-        self.pop('contributor')
-        self.incontributor = 0
-
-    def _start_dc_contributor(self, attrsD):
-        self.incontributor = 1
-        context = self._getContext()
-        context.setdefault('contributors', [])
-        context['contributors'].append(FeedParserDict())
-        self.push('name', 0)
-
-    def _end_dc_contributor(self):
-        self._end_name()
-        self.incontributor = 0
-
-    def _start_name(self, attrsD):
-        self.push('name', 0)
-    _start_itunes_name = _start_name
-
-    def _end_name(self):
-        value = self.pop('name')
-        if self.inpublisher:
-            self._save_author('name', value, 'publisher')
-        elif self.inauthor:
-            self._save_author('name', value)
-        elif self.incontributor:
-            self._save_contributor('name', value)
-        elif self.intextinput:
-            context = self._getContext()
-            context['name'] = value
-    _end_itunes_name = _end_name
-
-    def _start_width(self, attrsD):
-        self.push('width', 0)
-
-    def _end_width(self):
-        value = self.pop('width')
-        try:
-            value = int(value)
-        except:
-            value = 0
-        if self.inimage:
-            context = self._getContext()
-            context['width'] = value
-
-    def _start_height(self, attrsD):
-        self.push('height', 0)
-
-    def _end_height(self):
-        value = self.pop('height')
-        try:
-            value = int(value)
-        except:
-            value = 0
-        if self.inimage:
-            context = self._getContext()
-            context['height'] = value
-
-    def _start_url(self, attrsD):
-        self.push('href', 1)
-    _start_homepage = _start_url
-    _start_uri = _start_url
-
-    def _end_url(self):
-        value = self.pop('href')
-        if self.inauthor:
-            self._save_author('href', value)
-        elif self.incontributor:
-            self._save_contributor('href', value)
-    _end_homepage = _end_url
-    _end_uri = _end_url
-
-    def _start_email(self, attrsD):
-        self.push('email', 0)
-    _start_itunes_email = _start_email
-
-    def _end_email(self):
-        value = self.pop('email')
-        if self.inpublisher:
-            self._save_author('email', value, 'publisher')
-        elif self.inauthor:
-            self._save_author('email', value)
-        elif self.incontributor:
-            self._save_contributor('email', value)
-    _end_itunes_email = _end_email
-
-    def _getContext(self):
-        if self.insource:
-            context = self.sourcedata
-        elif self.inimage:
-            context = self.feeddata['image']
-        elif self.intextinput:
-            context = self.feeddata['textinput']
-        elif self.inentry:
-            context = self.entries[-1]
-        else:
-            context = self.feeddata
-        return context
-
-    def _save_author(self, key, value, prefix='author'):
-        context = self._getContext()
-        context.setdefault(prefix + '_detail', FeedParserDict())
-        context[prefix + '_detail'][key] = value
-        self._sync_author_detail()
-
-    def _save_contributor(self, key, value):
-        context = self._getContext()
-        context.setdefault('contributors', [FeedParserDict()])
-        context['contributors'][-1][key] = value
-
-    def _sync_author_detail(self, key='author'):
-        context = self._getContext()
-        detail = context.get('%s_detail' % key)
-        if detail:
-            name = detail.get('name')
-            email = detail.get('email')
-            if name and email:
-                context[key] = '%s (%s)' % (name, email)
-            elif name:
-                context[key] = name
-            elif email:
-                context[key] = email
-        else:
-            author, email = context.get(key), None
-            if not author: return
-            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
-            if emailmatch:
-                email = emailmatch.group(0)
-                # probably a better way to do the following, but it passes all the tests
-                author = author.replace(email, '')
-                author = author.replace('()', '')
-                author = author.replace('<>', '')
-                author = author.replace('&lt;&gt;', '')
-                author = author.strip()
-                if author and (author[0] == '('):
-                    author = author[1:]
-                if author and (author[-1] == ')'):
-                    author = author[:-1]
-                author = author.strip()
-            if author or email:
-                context.setdefault('%s_detail' % key, FeedParserDict())
-            if author:
-                context['%s_detail' % key]['name'] = author
-            if email:
-                context['%s_detail' % key]['email'] = email
-
-    def _start_subtitle(self, attrsD):
-        self.pushContent('subtitle', attrsD, 'text/plain', 1)
-    _start_tagline = _start_subtitle
-    _start_itunes_subtitle = _start_subtitle
-
-    def _end_subtitle(self):
-        self.popContent('subtitle')
-    _end_tagline = _end_subtitle
-    _end_itunes_subtitle = _end_subtitle
-
-    def _start_rights(self, attrsD):
-        self.pushContent('rights', attrsD, 'text/plain', 1)
-    _start_dc_rights = _start_rights
-    _start_copyright = _start_rights
-
-    def _end_rights(self):
-        self.popContent('rights')
-    _end_dc_rights = _end_rights
-    _end_copyright = _end_rights
-
-    def _start_item(self, attrsD):
-        self.entries.append(FeedParserDict())
-        self.push('item', 0)
-        self.inentry = 1
-        self.guidislink = 0
-        self.hasTitle = 0
-        id = self._getAttribute(attrsD, 'rdf:about')
-        if id:
-            context = self._getContext()
-            context['id'] = id
-        self._cdf_common(attrsD)
-    _start_entry = _start_item
-    _start_product = _start_item
-
-    def _end_item(self):
-        self.pop('item')
-        self.inentry = 0
-    _end_entry = _end_item
-
-    def _start_dc_language(self, attrsD):
-        self.push('language', 1)
-    _start_language = _start_dc_language
-
-    def _end_dc_language(self):
-        self.lang = self.pop('language')
-    _end_language = _end_dc_language
-
-    def _start_dc_publisher(self, attrsD):
-        self.push('publisher', 1)
-    _start_webmaster = _start_dc_publisher
-
-    def _end_dc_publisher(self):
-        self.pop('publisher')
-        self._sync_author_detail('publisher')
-    _end_webmaster = _end_dc_publisher
-
-    def _start_published(self, attrsD):
-        self.push('published', 1)
-    _start_dcterms_issued = _start_published
-    _start_issued = _start_published
-
-    def _end_published(self):
-        value = self.pop('published')
-        self._save('published_parsed', _parse_date(value))
-    _end_dcterms_issued = _end_published
-    _end_issued = _end_published
-
-    def _start_updated(self, attrsD):
-        self.push('updated', 1)
-    _start_modified = _start_updated
-    _start_dcterms_modified = _start_updated
-    _start_pubdate = _start_updated
-    _start_dc_date = _start_updated
-
-    def _end_updated(self):
-        value = self.pop('updated')
-        parsed_value = _parse_date(value)
-        self._save('updated_parsed', parsed_value)
-    _end_modified = _end_updated
-    _end_dcterms_modified = _end_updated
-    _end_pubdate = _end_updated
-    _end_dc_date = _end_updated
-
-    def _start_created(self, attrsD):
-        self.push('created', 1)
-    _start_dcterms_created = _start_created
-
-    def _end_created(self):
-        value = self.pop('created')
-        self._save('created_parsed', _parse_date(value))
-    _end_dcterms_created = _end_created
-
-    def _start_expirationdate(self, attrsD):
-        self.push('expired', 1)
-
-    def _end_expirationdate(self):
-        self._save('expired_parsed', _parse_date(self.pop('expired')))
-
-    def _start_cc_license(self, attrsD):
-        context = self._getContext()
-        value = self._getAttribute(attrsD, 'rdf:resource')
-        attrsD = FeedParserDict()
-        attrsD['rel']='license'
-        if value: attrsD['href']=value
-        context.setdefault('links', []).append(attrsD)
-
-    def _start_creativecommons_license(self, attrsD):
-        self.push('license', 1)
-    _start_creativeCommons_license = _start_creativecommons_license
-
-    def _end_creativecommons_license(self):
-        value = self.pop('license')
-        context = self._getContext()
-        attrsD = FeedParserDict()
-        attrsD['rel']='license'
-        if value: attrsD['href']=value
-        context.setdefault('links', []).append(attrsD)
-        del context['license']
-    _end_creativeCommons_license = _end_creativecommons_license
-
-    def _addXFN(self, relationships, href, name):
|
1291 |
- context = self._getContext() |
|
1292 |
- xfn = context.setdefault('xfn', []) |
|
1293 |
- value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) |
|
1294 |
- if value not in xfn: |
|
1295 |
- xfn.append(value) |
|
1296 |
- |
|
1297 |
- def _addTag(self, term, scheme, label): |
|
1298 |
- context = self._getContext() |
|
1299 |
- tags = context.setdefault('tags', []) |
|
1300 |
- if (not term) and (not scheme) and (not label): return |
|
1301 |
- value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) |
|
1302 |
- if value not in tags: |
|
1303 |
- tags.append(value) |
|
1304 |
- |
|
1305 |
- def _start_category(self, attrsD): |
|
1306 |
- if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) |
|
1307 |
- term = attrsD.get('term') |
|
1308 |
- scheme = attrsD.get('scheme', attrsD.get('domain')) |
|
1309 |
- label = attrsD.get('label') |
|
1310 |
- self._addTag(term, scheme, label) |
|
1311 |
- self.push('category', 1) |
|
1312 |
- _start_dc_subject = _start_category |
|
1313 |
- _start_keywords = _start_category |
|
1314 |
- |
|
1315 |
- def _end_itunes_keywords(self): |
|
1316 |
- for term in self.pop('itunes_keywords').split(): |
|
1317 |
- self._addTag(term, 'http://www.itunes.com/', None) |
|
1318 |
- |
|
1319 |
- def _start_itunes_category(self, attrsD): |
|
1320 |
- self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) |
|
1321 |
- self.push('category', 1) |
|
1322 |
- |
|
1323 |
- def _end_category(self): |
|
1324 |
- value = self.pop('category') |
|
1325 |
- if not value: return |
|
1326 |
- context = self._getContext() |
|
1327 |
- tags = context['tags'] |
|
1328 |
- if value and len(tags) and not tags[-1]['term']: |
|
1329 |
- tags[-1]['term'] = value |
|
1330 |
- else: |
|
1331 |
- self._addTag(value, None, None) |
|
1332 |
- _end_dc_subject = _end_category |
|
1333 |
- _end_keywords = _end_category |
|
1334 |
- _end_itunes_category = _end_category |
|
1335 |
- |
|
1336 |
- def _start_cloud(self, attrsD): |
|
1337 |
- self._getContext()['cloud'] = FeedParserDict(attrsD) |
|
1338 |
- |
|
1339 |
- def _start_link(self, attrsD): |
|
1340 |
- attrsD.setdefault('rel', 'alternate') |
|
1341 |
- if attrsD['rel'] == 'self': |
|
1342 |
- attrsD.setdefault('type', 'application/atom+xml') |
|
1343 |
- else: |
|
1344 |
- attrsD.setdefault('type', 'text/html') |
|
1345 |
- context = self._getContext() |
|
1346 |
- attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1347 |
- if attrsD.has_key('href'): |
|
1348 |
- attrsD['href'] = self.resolveURI(attrsD['href']) |
|
1349 |
- if attrsD.get('rel')=='enclosure' and not context.get('id'): |
|
1350 |
- context['id'] = attrsD.get('href') |
|
1351 |
- expectingText = self.infeed or self.inentry or self.insource |
|
1352 |
- context.setdefault('links', []) |
|
1353 |
- context['links'].append(FeedParserDict(attrsD)) |
|
1354 |
- if attrsD.has_key('href'): |
|
1355 |
- expectingText = 0 |
|
1356 |
- if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): |
|
1357 |
- context['link'] = attrsD['href'] |
|
1358 |
- else: |
|
1359 |
- self.push('link', expectingText) |
|
1360 |
- _start_producturl = _start_link |
|
1361 |
- |
|
1362 |
- def _end_link(self): |
|
1363 |
- value = self.pop('link') |
|
1364 |
- context = self._getContext() |
|
1365 |
- _end_producturl = _end_link |
|
1366 |
- |
|
1367 |
- def _start_guid(self, attrsD): |
|
1368 |
- self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') |
|
1369 |
- self.push('id', 1) |
|
1370 |
- |
|
1371 |
- def _end_guid(self): |
|
1372 |
- value = self.pop('id') |
|
1373 |
- self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) |
|
1374 |
- if self.guidislink: |
|
1375 |
- # guid acts as link, but only if 'ispermalink' is not present or is 'true', |
|
1376 |
- # and only if the item doesn't already have a link element |
|
1377 |
- self._save('link', value) |
|
1378 |
- |
|
1379 |
- def _start_title(self, attrsD): |
|
1380 |
- if self.svgOK: return self.unknown_starttag('title', attrsD.items()) |
|
1381 |
- self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|
1382 |
- _start_dc_title = _start_title |
|
1383 |
- _start_media_title = _start_title |
|
1384 |
- |
|
1385 |
- def _end_title(self): |
|
1386 |
- if self.svgOK: return |
|
1387 |
- value = self.popContent('title') |
|
1388 |
- if not value: return |
|
1389 |
- context = self._getContext() |
|
1390 |
- self.hasTitle = 1 |
|
1391 |
- _end_dc_title = _end_title |
|
1392 |
- |
|
1393 |
- def _end_media_title(self): |
|
1394 |
- hasTitle = self.hasTitle |
|
1395 |
- self._end_title() |
|
1396 |
- self.hasTitle = hasTitle |
|
1397 |
- |
|
1398 |
- def _start_description(self, attrsD): |
|
1399 |
- context = self._getContext() |
|
1400 |
- if context.has_key('summary'): |
|
1401 |
- self._summaryKey = 'content' |
|
1402 |
- self._start_content(attrsD) |
|
1403 |
- else: |
|
1404 |
- self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) |
|
1405 |
- _start_dc_description = _start_description |
|
1406 |
- |
|
1407 |
- def _start_abstract(self, attrsD): |
|
1408 |
- self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|
1409 |
- |
|
1410 |
- def _end_description(self): |
|
1411 |
- if self._summaryKey == 'content': |
|
1412 |
- self._end_content() |
|
1413 |
- else: |
|
1414 |
- value = self.popContent('description') |
|
1415 |
- self._summaryKey = None |
|
1416 |
- _end_abstract = _end_description |
|
1417 |
- _end_dc_description = _end_description |
|
1418 |
- |
|
1419 |
- def _start_info(self, attrsD): |
|
1420 |
- self.pushContent('info', attrsD, 'text/plain', 1) |
|
1421 |
- _start_feedburner_browserfriendly = _start_info |
|
1422 |
- |
|
1423 |
- def _end_info(self): |
|
1424 |
- self.popContent('info') |
|
1425 |
- _end_feedburner_browserfriendly = _end_info |
|
1426 |
- |
|
1427 |
- def _start_generator(self, attrsD): |
|
1428 |
- if attrsD: |
|
1429 |
- attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1430 |
- if attrsD.has_key('href'): |
|
1431 |
- attrsD['href'] = self.resolveURI(attrsD['href']) |
|
1432 |
- self._getContext()['generator_detail'] = FeedParserDict(attrsD) |
|
1433 |
- self.push('generator', 1) |
|
1434 |
- |
|
1435 |
- def _end_generator(self): |
|
1436 |
- value = self.pop('generator') |
|
1437 |
- context = self._getContext() |
|
1438 |
- if context.has_key('generator_detail'): |
|
1439 |
- context['generator_detail']['name'] = value |
|
1440 |
- |
|
1441 |
- def _start_admin_generatoragent(self, attrsD): |
|
1442 |
- self.push('generator', 1) |
|
1443 |
- value = self._getAttribute(attrsD, 'rdf:resource') |
|
1444 |
- if value: |
|
1445 |
- self.elementstack[-1][2].append(value) |
|
1446 |
- self.pop('generator') |
|
1447 |
- self._getContext()['generator_detail'] = FeedParserDict({'href': value}) |
|
1448 |
- |
|
1449 |
- def _start_admin_errorreportsto(self, attrsD): |
|
1450 |
- self.push('errorreportsto', 1) |
|
1451 |
- value = self._getAttribute(attrsD, 'rdf:resource') |
|
1452 |
- if value: |
|
1453 |
- self.elementstack[-1][2].append(value) |
|
1454 |
- self.pop('errorreportsto') |
|
1455 |
- |
|
1456 |
- def _start_summary(self, attrsD): |
|
1457 |
- context = self._getContext() |
|
1458 |
- if context.has_key('summary'): |
|
1459 |
- self._summaryKey = 'content' |
|
1460 |
- self._start_content(attrsD) |
|
1461 |
- else: |
|
1462 |
- self._summaryKey = 'summary' |
|
1463 |
- self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) |
|
1464 |
- _start_itunes_summary = _start_summary |
|
1465 |
- |
|
1466 |
- def _end_summary(self): |
|
1467 |
- if self._summaryKey == 'content': |
|
1468 |
- self._end_content() |
|
1469 |
- else: |
|
1470 |
- self.popContent(self._summaryKey or 'summary') |
|
1471 |
- self._summaryKey = None |
|
1472 |
- _end_itunes_summary = _end_summary |
|
1473 |
- |
|
1474 |
- def _start_enclosure(self, attrsD): |
|
1475 |
- attrsD = self._itsAnHrefDamnIt(attrsD) |
|
1476 |
- context = self._getContext() |
|
1477 |
- attrsD['rel']='enclosure' |
|
1478 |
- context.setdefault('links', []).append(FeedParserDict(attrsD)) |
|
1479 |
- href = attrsD.get('href') |
|
1480 |
- if href and not context.get('id'): |
|
1481 |
- context['id'] = href |
|
1482 |
- |
|
1483 |
- def _start_source(self, attrsD): |
|
1484 |
- self.insource = 1 |
|
1485 |
- self.hasTitle = 0 |
|
1486 |
- |
|
1487 |
- def _end_source(self): |
|
1488 |
- self.insource = 0 |
|
1489 |
- self._getContext()['source'] = copy.deepcopy(self.sourcedata) |
|
1490 |
- self.sourcedata.clear() |
|
1491 |
- |
|
1492 |
- def _start_content(self, attrsD): |
|
1493 |
- self.pushContent('content', attrsD, 'text/plain', 1) |
|
1494 |
- src = attrsD.get('src') |
|
1495 |
- if src: |
|
1496 |
- self.contentparams['src'] = src |
|
1497 |
- self.push('content', 1) |
|
1498 |
- |
|
1499 |
- def _start_prodlink(self, attrsD): |
|
1500 |
- self.pushContent('content', attrsD, 'text/html', 1) |
|
1501 |
- |
|
1502 |
- def _start_body(self, attrsD): |
|
1503 |
- self.pushContent('content', attrsD, 'application/xhtml+xml', 1) |
|
1504 |
- _start_xhtml_body = _start_body |
|
1505 |
- |
|
1506 |
- def _start_content_encoded(self, attrsD): |
|
1507 |
- self.pushContent('content', attrsD, 'text/html', 1) |
|
1508 |
- _start_fullitem = _start_content_encoded |
|
1509 |
- |
|
1510 |
- def _end_content(self): |
|
1511 |
- copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) |
|
1512 |
- value = self.popContent('content') |
|
1513 |
- if copyToDescription: |
|
1514 |
- self._save('description', value) |
|
1515 |
- |
|
1516 |
- _end_body = _end_content |
|
1517 |
- _end_xhtml_body = _end_content |
|
1518 |
- _end_content_encoded = _end_content |
|
1519 |
- _end_fullitem = _end_content |
|
1520 |
- _end_prodlink = _end_content |
|
1521 |
- |
|
1522 |
- def _start_itunes_image(self, attrsD): |
|
1523 |
- self.push('itunes_image', 0) |
|
1524 |
- self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) |
|
1525 |
- _start_itunes_link = _start_itunes_image |
|
1526 |
- |
|
1527 |
- def _end_itunes_block(self): |
|
1528 |
- value = self.pop('itunes_block', 0) |
|
1529 |
- self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 |
|
1530 |
- |
|
1531 |
- def _end_itunes_explicit(self): |
|
1532 |
- value = self.pop('itunes_explicit', 0) |
|
1533 |
- self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 |
|
1534 |
- |
|
1535 |
-if _XML_AVAILABLE: |
|
1536 |
- class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): |
|
1537 |
- def __init__(self, baseuri, baselang, encoding): |
|
1538 |
- if _debug: sys.stderr.write('trying StrictFeedParser\n') |
|
1539 |
- xml.sax.handler.ContentHandler.__init__(self) |
|
1540 |
- _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
1541 |
- self.bozo = 0 |
|
1542 |
- self.exc = None |
|
1543 |
- |
|
1544 |
- def startPrefixMapping(self, prefix, uri): |
|
1545 |
- self.trackNamespace(prefix, uri) |
|
1546 |
- |
|
1547 |
- def startElementNS(self, name, qname, attrs): |
|
1548 |
- namespace, localname = name |
|
1549 |
- lowernamespace = str(namespace or '').lower() |
|
1550 |
- if lowernamespace.find('backend.userland.com/rss') <> -1: |
|
1551 |
- # match any backend.userland.com namespace |
|
1552 |
- namespace = 'http://backend.userland.com/rss' |
|
1553 |
- lowernamespace = namespace |
|
1554 |
- if qname and qname.find(':') > 0: |
|
1555 |
- givenprefix = qname.split(':')[0] |
|
1556 |
- else: |
|
1557 |
- givenprefix = None |
|
1558 |
- prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
1559 |
- if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): |
|
1560 |
- raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix |
|
1561 |
- localname = str(localname).lower() |
|
1562 |
- |
|
1563 |
- # qname implementation is horribly broken in Python 2.1 (it |
|
1564 |
- # doesn't report any), and slightly broken in Python 2.2 (it |
|
1565 |
- # doesn't report the xml: namespace). So we match up namespaces |
|
1566 |
- # with a known list first, and then possibly override them with |
|
1567 |
- # the qnames the SAX parser gives us (if indeed it gives us any |
|
1568 |
- # at all). Thanks to MatejC for helping me test this and |
|
1569 |
- # tirelessly telling me that it didn't work yet. |
|
1570 |
- attrsD = {} |
|
1571 |
- if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': |
|
1572 |
- attrsD['xmlns']=namespace |
|
1573 |
- if localname=='svg' and namespace=='http://www.w3.org/2000/svg': |
|
1574 |
- attrsD['xmlns']=namespace |
|
1575 |
- |
|
1576 |
- if prefix: |
|
1577 |
- localname = prefix.lower() + ':' + localname |
|
1578 |
- elif namespace and not qname: #Expat |
|
1579 |
- for name,value in self.namespacesInUse.items(): |
|
1580 |
- if name and value == namespace: |
|
1581 |
- localname = name + ':' + localname |
|
1582 |
- break |
|
1583 |
- if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) |
|
1584 |
- |
|
1585 |
- for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): |
|
1586 |
- lowernamespace = (namespace or '').lower() |
|
1587 |
- prefix = self._matchnamespaces.get(lowernamespace, '') |
|
1588 |
- if prefix: |
|
1589 |
- attrlocalname = prefix + ':' + attrlocalname |
|
1590 |
- attrsD[str(attrlocalname).lower()] = attrvalue |
|
1591 |
- for qname in attrs.getQNames(): |
|
1592 |
- attrsD[str(qname).lower()] = attrs.getValueByQName(qname) |
|
1593 |
- self.unknown_starttag(localname, attrsD.items()) |
|
1594 |
- |
|
1595 |
- def characters(self, text): |
|
1596 |
- self.handle_data(text) |
|
1597 |
- |
|
1598 |
- def endElementNS(self, name, qname): |
|
1599 |
- namespace, localname = name |
|
1600 |
- lowernamespace = str(namespace or '').lower() |
|
1601 |
- if qname and qname.find(':') > 0: |
|
1602 |
- givenprefix = qname.split(':')[0] |
|
1603 |
- else: |
|
1604 |
- givenprefix = '' |
|
1605 |
- prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|
1606 |
- if prefix: |
|
1607 |
- localname = prefix + ':' + localname |
|
1608 |
- elif namespace and not qname: #Expat |
|
1609 |
- for name,value in self.namespacesInUse.items(): |
|
1610 |
- if name and value == namespace: |
|
1611 |
- localname = name + ':' + localname |
|
1612 |
- break |
|
1613 |
- localname = str(localname).lower() |
|
1614 |
- self.unknown_endtag(localname) |
|
1615 |
- |
|
1616 |
- def error(self, exc): |
|
1617 |
- self.bozo = 1 |
|
1618 |
- self.exc = exc |
|
1619 |
- |
|
1620 |
- def fatalError(self, exc): |
|
1621 |
- self.error(exc) |
|
1622 |
- raise exc |
|
1623 |
- |
|
1624 |
-class _BaseHTMLProcessor(sgmllib.SGMLParser): |
|
1625 |
- special = re.compile('''[<>'"]''') |
|
1626 |
- bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") |
|
1627 |
- elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', |
|
1628 |
- 'img', 'input', 'isindex', 'link', 'meta', 'param'] |
|
1629 |
- |
|
1630 |
- def __init__(self, encoding, type): |
|
1631 |
- self.encoding = encoding |
|
1632 |
- self.type = type |
|
1633 |
- if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) |
|
1634 |
- sgmllib.SGMLParser.__init__(self) |
|
1635 |
- |
|
1636 |
- def reset(self): |
|
1637 |
- self.pieces = [] |
|
1638 |
- sgmllib.SGMLParser.reset(self) |
|
1639 |
- |
|
1640 |
- def _shorttag_replace(self, match): |
|
1641 |
- tag = match.group(1) |
|
1642 |
- if tag in self.elements_no_end_tag: |
|
1643 |
- return '<' + tag + ' />' |
|
1644 |
- else: |
|
1645 |
- return '<' + tag + '></' + tag + '>' |
|
1646 |
- |
|
1647 |
- def parse_starttag(self,i): |
|
1648 |
- j=sgmllib.SGMLParser.parse_starttag(self, i) |
|
1649 |
- if self.type == 'application/xhtml+xml': |
|
1650 |
- if j>2 and self.rawdata[j-2:j]=='/>': |
|
1651 |
- self.unknown_endtag(self.lasttag) |
|
1652 |
- return j |
|
1653 |
- |
|
1654 |
- def feed(self, data): |
|
1655 |
- data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data) |
|
1656 |
- #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace |
|
1657 |
- data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) |
|
1658 |
- data = data.replace('&#39;', "'") |
|
1659 |
- data = data.replace('"', '"') |
|
1660 |
- if self.encoding and type(data) == type(u''): |
|
1661 |
- data = data.encode(self.encoding) |
|
1662 |
- sgmllib.SGMLParser.feed(self, data) |
|
1663 |
- sgmllib.SGMLParser.close(self) |
|
1664 |
- |
|
1665 |
- def normalize_attrs(self, attrs): |
|
1666 |
- if not attrs: return attrs |
|
1667 |
- # utility method to be called by descendants |
|
1668 |
- attrs = dict([(k.lower(), v) for k, v in attrs]).items() |
|
1669 |
- attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] |
|
1670 |
- attrs.sort() |
|
1671 |
- return attrs |
|
1672 |
- |
|
1673 |
- def unknown_starttag(self, tag, attrs): |
|
1674 |
- # called for each start tag |
|
1675 |
- # attrs is a list of (attr, value) tuples |
|
1676 |
- # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')] |
|
1677 |
- if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag) |
|
1678 |
- uattrs = [] |
|
1679 |
- strattrs='' |
|
1680 |
- if attrs: |
|
1681 |
- for key, value in attrs: |
|
1682 |
- value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') |
|
1683 |
- value = self.bare_ampersand.sub("&amp;", value) |
|
1684 |
- # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds |
|
1685 |
- if type(value) != type(u''): |
|
1686 |
- try: |
|
1687 |
- value = unicode(value, self.encoding) |
|
1688 |
- except: |
|
1689 |
- value = unicode(value, 'iso-8859-1') |
|
1690 |
- uattrs.append((unicode(key, self.encoding), value)) |
|
1691 |
- strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]) |
|
1692 |
- if self.encoding: |
|
1693 |
- try: |
|
1694 |
- strattrs=strattrs.encode(self.encoding) |
|
1695 |
- except: |
|
1696 |
- pass |
|
1697 |
- if tag in self.elements_no_end_tag: |
|
1698 |
- self.pieces.append('<%(tag)s%(strattrs)s />' % locals()) |
|
1699 |
- else: |
|
1700 |
- self.pieces.append('<%(tag)s%(strattrs)s>' % locals()) |
|
1701 |
- |
|
1702 |
- def unknown_endtag(self, tag): |
|
1703 |
- # called for each end tag, e.g. for </pre>, tag will be 'pre' |
|
1704 |
- # Reconstruct the original end tag. |
|
1705 |
- if tag not in self.elements_no_end_tag: |
|
1706 |
- self.pieces.append("</%(tag)s>" % locals()) |
|
1707 |
- |
|
1708 |
- def handle_charref(self, ref): |
|
1709 |
- # called for each character reference, e.g. for '&#160;', ref will be '160' |
|
1710 |
- # Reconstruct the original character reference. |
|
1711 |
- if ref.startswith('x'): |
|
1712 |
- value = unichr(int(ref[1:],16)) |
|
1713 |
- else: |
|
1714 |
- value = unichr(int(ref)) |
|
1715 |
- |
|
1716 |
- if value in _cp1252.keys(): |
|
1717 |
- self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) |
|
1718 |
- else: |
|
1719 |
- self.pieces.append('&#%(ref)s;' % locals()) |
|
1720 |
- |
|
1721 |
- def handle_entityref(self, ref): |
|
1722 |
- # called for each entity reference, e.g. for '&copy;', ref will be 'copy' |
|
1723 |
- # Reconstruct the original entity reference. |
|
1724 |
- if name2codepoint.has_key(ref): |
|
1725 |
- self.pieces.append('&%(ref)s;' % locals()) |
|
1726 |
- else: |
|
1727 |
- self.pieces.append('&amp;%(ref)s' % locals()) |
|
1728 |
- |
|
1729 |
- def handle_data(self, text): |
|
1730 |
- # called for each block of plain text, i.e. outside of any tag and |
|
1731 |
- # not containing any character or entity references |
|
1732 |
- # Store the original text verbatim. |
|
1733 |
- if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) |
|
1734 |
- self.pieces.append(text) |
|
1735 |
- |
|
1736 |
- def handle_comment(self, text): |
|
1737 |
- # called for each HTML comment, e.g. <!-- insert Javascript code here --> |
|
1738 |
- # Reconstruct the original comment. |
|
1739 |
- self.pieces.append('<!--%(text)s-->' % locals()) |
|
1740 |
- |
|
1741 |
- def handle_pi(self, text): |
|
1742 |
- # called for each processing instruction, e.g. <?instruction> |
|
1743 |
- # Reconstruct original processing instruction. |
|
1744 |
- self.pieces.append('<?%(text)s>' % locals()) |
|
1745 |
- |
|
1746 |
- def handle_decl(self, text): |
|
1747 |
- # called for the DOCTYPE, if present, e.g. |
|
1748 |
- # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" |
|
1749 |
- # "http://www.w3.org/TR/html4/loose.dtd"> |
|
1750 |
- # Reconstruct original DOCTYPE |
|
1751 |
- self.pieces.append('<!%(text)s>' % locals()) |
|
1752 |
- |
|
1753 |
- _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match |
|
1754 |
- def _scan_name(self, i, declstartpos): |
|
1755 |
- rawdata = self.rawdata |
|
1756 |
- n = len(rawdata) |
|
1757 |
- if i == n: |
|
1758 |
- return None, -1 |
|
1759 |
- m = self._new_declname_match(rawdata, i) |
|
1760 |
- if m: |
|
1761 |
- s = m.group() |
|
1762 |
- name = s.strip() |
|
1763 |
- if (i + len(s)) == n: |
|
1764 |
- return None, -1 # end of buffer |
|
1765 |
- return name.lower(), m.end() |
|
1766 |
- else: |
|
1767 |
- self.handle_data(rawdata) |
|
1768 |
-# self.updatepos(declstartpos, i) |
|
1769 |
- return None, -1 |
|
1770 |
- |
|
1771 |
- def convert_charref(self, name): |
|
1772 |
- return '&#%s;' % name |
|
1773 |
- |
|
1774 |
- def convert_entityref(self, name): |
|
1775 |
- return '&%s;' % name |
|
1776 |
- |
|
1777 |
- def output(self): |
|
1778 |
- '''Return processed HTML as a single string''' |
|
1779 |
- return ''.join([str(p) for p in self.pieces]) |
|
1780 |
- |
|
1781 |
-class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): |
|
1782 |
- def __init__(self, baseuri, baselang, encoding, entities): |
|
1783 |
- sgmllib.SGMLParser.__init__(self) |
|
1784 |
- _FeedParserMixin.__init__(self, baseuri, baselang, encoding) |
|
1785 |
- _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml') |
|
1786 |
- self.entities=entities |
|
1787 |
- |
|
1788 |
- def decodeEntities(self, element, data): |
|
1789 |
- data = data.replace('&#60;', '&lt;') |
|
1790 |
- data = data.replace('&#x3c;', '&lt;') |
|
1791 |
- data = data.replace('&#x3C;', '&lt;') |
|
1792 |
- data = data.replace('&#62;', '&gt;') |
|
1793 |
- data = data.replace('&#x3e;', '&gt;') |
|
1794 |
- data = data.replace('&#x3E;', '&gt;') |
|
1795 |
- data = data.replace('&#38;', '&amp;') |
|
1796 |
- data = data.replace('&#x26;', '&amp;') |
|
1797 |
- data = data.replace('&#34;', '&quot;') |
|
1798 |
- data = data.replace('&#x22;', '&quot;') |
|
1799 |
- data = data.replace('&#39;', '&apos;') |
|
1800 |
- data = data.replace('&#x27;', '&apos;') |
|
1801 |
- if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): |
|
1802 |
- data = data.replace('&lt;', '<') |
|
1803 |
- data = data.replace('&gt;', '>') |
|
1804 |
- data = data.replace('&amp;', '&') |
|
1805 |
- data = data.replace('&quot;', '"') |
|
1806 |
- data = data.replace('&apos;', "'") |
|
1807 |
- return data |
|
1808 |
- |
|
1809 |
- def strattrs(self, attrs): |
|
1810 |
- return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs]) |
|
1811 |
- |
|
1812 |
-class _MicroformatsParser: |
|
1813 |
- STRING = 1 |
|
1814 |
- DATE = 2 |
|
1815 |
- URI = 3 |
|
1816 |
- NODE = 4 |
|
1817 |
- EMAIL = 5 |
|
1818 |
- |
|
1819 |
- known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'] |
|
1820 |
- known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'] |
|
1821 |
- |
|
1822 |
- def __init__(self, data, baseuri, encoding): |
|
1823 |
- self.document = BeautifulSoup.BeautifulSoup(data) |
|
1824 |
- self.baseuri = baseuri |
|
1825 |
- self.encoding = encoding |
|
1826 |
- if type(data) == type(u''): |
|
1827 |
- data = data.encode(encoding) |
|
1828 |
- self.tags = [] |
|
1829 |
- self.enclosures = [] |
|
1830 |
- self.xfn = [] |
|
1831 |
- self.vcard = None |
|
1832 |
- |
|
1833 |
- def vcardEscape(self, s): |
|
1834 |
- if type(s) in (type(''), type(u'')): |
|
1835 |
- s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') |
|
1836 |
- return s |
|
1837 |
- |
|
1838 |
- def vcardFold(self, s): |
|
1839 |
- s = re.sub(';+$', '', s) |
|
1840 |
- sFolded = '' |
|
1841 |
- iMax = 75 |
|
1842 |
- sPrefix = '' |
|
1843 |
- while len(s) > iMax: |
|
1844 |
- sFolded += sPrefix + s[:iMax] + '\n' |
|
1845 |
- s = s[iMax:] |
|
1846 |
- sPrefix = ' ' |
|
1847 |
- iMax = 74 |
|
1848 |
- sFolded += sPrefix + s |
|
1849 |
- return sFolded |
|
1850 |
- |
|
1851 |
- def normalize(self, s): |
|
1852 |
- return re.sub(r'\s+', ' ', s).strip() |
|
1853 |
- |
|
1854 |
- def unique(self, aList): |
|
1855 |
- results = [] |
|
1856 |
- for element in aList: |
|
1857 |
- if element not in results: |
|
1858 |
- results.append(element) |
|
1859 |
- return results |
|
1860 |
- |
|
1861 |
- def toISO8601(self, dt): |
|
1862 |
- return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) |
|
1863 |
- |
|
1864 |
- def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0): |
|
1865 |
- all = lambda x: 1 |
|
1866 |
- sProperty = sProperty.lower() |
|
1867 |
- bFound = 0 |
|
1868 |
- bNormalize = 1 |
|
1869 |
- propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)} |
|
1870 |
- if bAllowMultiple and (iPropertyType != self.NODE): |
|
1871 |
- snapResults = [] |
|
1872 |
- containers = elmRoot(['ul', 'ol'], propertyMatch) |
|
1873 |
- for container in containers: |
|
1874 |
- snapResults.extend(container('li')) |
|
1875 |
- bFound = (len(snapResults) != 0) |
|
1876 |
- if not bFound: |
|
1877 |
- snapResults = elmRoot(all, propertyMatch) |
|
1878 |
- bFound = (len(snapResults) != 0) |
|
1879 |
- if (not bFound) and (sProperty == 'value'): |
|
1880 |
- snapResults = elmRoot('pre') |
|
1881 |
- bFound = (len(snapResults) != 0) |
|
1882 |
- bNormalize = not bFound |
|
1883 |
- if not bFound: |
|
1884 |
- snapResults = [elmRoot] |
|
1885 |
- bFound = (len(snapResults) != 0) |
|
1886 |
- arFilter = [] |
|
1887 |
- if sProperty == 'vcard': |
|
1888 |
- snapFilter = elmRoot(all, propertyMatch) |
|
1889 |
- for node in snapFilter: |
|
1890 |
- if node.findParent(all, propertyMatch): |
|
1891 |
- arFilter.append(node) |
|
1892 |
- arResults = [] |
|
1893 |
- for node in snapResults: |
|
1894 |
- if node not in arFilter: |
|
1895 |
- arResults.append(node) |
|
1896 |
- bFound = (len(arResults) != 0) |
|
1897 |
- if not bFound: |
|
1898 |
- if bAllowMultiple: return [] |
|
1899 |
- elif iPropertyType == self.STRING: return '' |
|
1900 |
- elif iPropertyType == self.DATE: return None |
|
1901 |
- elif iPropertyType == self.URI: return '' |
|
1902 |
- elif iPropertyType == self.NODE: return None |
|
1903 |
- else: return None |
|
1904 |
- arValues = [] |
|
1905 |
- for elmResult in arResults: |
|
1906 |
- sValue = None |
|
1907 |
- if iPropertyType == self.NODE: |
|
1908 |
- if bAllowMultiple: |
|
1909 |
- arValues.append(elmResult) |
|
1910 |
- continue |
|
1911 |
- else: |
|
1912 |
- return elmResult |
|
1913 |
- sNodeName = elmResult.name.lower() |
|
1914 |
- if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): |
|
1915 |
- sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0] |
|
1916 |
- if sValue: |
|
1917 |
- sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1918 |
- if (not sValue) and (sNodeName == 'abbr'): |
|
1919 |
- sValue = elmResult.get('title') |
|
1920 |
- if sValue: |
|
1921 |
- sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1922 |
- if (not sValue) and (iPropertyType == self.URI): |
|
1923 |
- if sNodeName == 'a': sValue = elmResult.get('href') |
|
1924 |
- elif sNodeName == 'img': sValue = elmResult.get('src') |
|
1925 |
- elif sNodeName == 'object': sValue = elmResult.get('data') |
|
1926 |
- if sValue: |
|
1927 |
- sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1928 |
- if (not sValue) and (sNodeName == 'img'): |
|
1929 |
- sValue = elmResult.get('alt') |
|
1930 |
- if sValue: |
|
1931 |
- sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1932 |
- if not sValue: |
|
1933 |
- sValue = elmResult.renderContents() |
|
1934 |
- sValue = re.sub(r'<\S[^>]*>', '', sValue) |
|
1935 |
- sValue = sValue.replace('\r\n', '\n') |
|
1936 |
- sValue = sValue.replace('\r', '\n') |
|
1937 |
- if sValue: |
|
1938 |
- sValue = bNormalize and self.normalize(sValue) or sValue.strip() |
|
1939 |
- if not sValue: continue |
|
1940 |
- if iPropertyType == self.DATE: |
|
1941 |
- sValue = _parse_date_iso8601(sValue) |
|
1942 |
- if bAllowMultiple: |
|
1943 |
- arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) |
|
1944 |
- else: |
|
1945 |
- return bAutoEscape and self.vcardEscape(sValue) or sValue |
|
1946 |
- return arValues |
|
1947 |
- |
|
1948 |
- def findVCards(self, elmRoot, bAgentParsing=0): |
|
1949 |
- sVCards = '' |
|
1950 |
- |
|
1951 |
- if not bAgentParsing: |
|
1952 |
- arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) |
|
1953 |
- else: |
|
1954 |
- arCards = [elmRoot] |
|
1955 |
- |
|
1956 |
- for elmCard in arCards: |
|
1957 |
- arLines = [] |
|
1958 |
- |
|
1959 |
- def processSingleString(sProperty): |
|
1960 |
- sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1) |
|
1961 |
- if sValue: |
|
1962 |
- arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) |
|
1963 |
- return sValue or '' |
|
1964 |
- |
|
1965 |
- def processSingleURI(sProperty): |
|
1966 |
- sValue = self.getPropertyValue(elmCard, sProperty, self.URI) |
|
1967 |
- if sValue: |
|
1968 |
- sContentType = '' |
|
1969 |
- sEncoding = '' |
|
1970 |
- sValueKey = '' |
|
1971 |
- if sValue.startswith('data:'): |
|
1972 |
- sEncoding = ';ENCODING=b' |
|
1973 |
- sContentType = sValue.split(';')[0].split('/').pop() |
|
1974 |
- sValue = sValue.split(',', 1).pop() |
|
1975 |
- else: |
|
1976 |
- elmValue = self.getPropertyValue(elmCard, sProperty) |
|
1977 |
- if elmValue: |
|
1978 |
- if sProperty != 'url': |
|
1979 |
- sValueKey = ';VALUE=uri' |
|
1980 |
- sContentType = elmValue.get('type', '').strip().split('/').pop().strip() |
|
1981 |
- sContentType = sContentType.upper() |
|
1982 |
- if sContentType == 'OCTET-STREAM': |
|
1983 |
- sContentType = '' |
|
1984 |
- if sContentType: |
|
1985 |
- sContentType = ';TYPE=' + sContentType.upper() |
|
1986 |
- arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) |
|
1987 |
- |
|
1988 |
- def processTypeValue(sProperty, arDefaultType, arForceType=None): |
|
1989 |
- arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) |
|
1990 |
- for elmResult in arResults: |
|
1991 |
- arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) |
|
1992 |
- if arForceType: |
|
1993 |
- arType = self.unique(arForceType + arType) |
|
1994 |
- if not arType: |
|
1995 |
- arType = arDefaultType |
|
1996 |
- sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) |
|
1997 |
- if sValue: |
|
1998 |
- arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) |
|
1999 |
- |
|
2000 |
- # AGENT |
|
2001 |
- # must do this before all other properties because it is destructive |
|
2002 |
- # (removes nested class="vcard" nodes so they don't interfere with |
|
2003 |
- # this vcard's other properties) |
|
2004 |
- arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) |
|
2005 |
- for elmAgent in arAgent: |
|
2006 |
- if re.compile(r'\bvcard\b').search(elmAgent.get('class')): |
|
2007 |
- sAgentValue = self.findVCards(elmAgent, 1) + '\n' |
|
2008 |
- sAgentValue = sAgentValue.replace('\n', '\\n') |
|
2009 |
- sAgentValue = sAgentValue.replace(';', '\\;') |
|
2010 |
- if sAgentValue: |
|
2011 |
- arLines.append(self.vcardFold('AGENT:' + sAgentValue)) |
|
2012 |
- elmAgent['class'] = '' |
|
2013 |
- elmAgent.contents = [] |
|
2014 |
- else: |
|
2015 |
- sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); |
|
2016 |
- if sAgentValue: |
|
2017 |
- arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) |
|
2018 |
- |
|
2019 |
- # FN (full name) |
|
2020 |
- sFN = processSingleString('fn') |
|
2021 |
- |
|
2022 |
- # N (name) |
|
2023 |
- elmName = self.getPropertyValue(elmCard, 'n') |
|
2024 |
- if elmName: |
|
2025 |
- sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) |
|
2026 |
- sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) |
|
2027 |
- arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) |
|
2028 |
- arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) |
|
2029 |
- arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) |
|
2030 |
- arLines.append(self.vcardFold('N:' + sFamilyName + ';' + |
|
2031 |
- sGivenName + ';' + |
|
2032 |
- ','.join(arAdditionalNames) + ';' + |
|
2033 |
- ','.join(arHonorificPrefixes) + ';' + |
|
2034 |
- ','.join(arHonorificSuffixes))) |
|
2035 |
- elif sFN: |
|
2036 |
- # implied "N" optimization |
|
2037 |
- # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization |
|
2038 |
- arNames = self.normalize(sFN).split() |
|
2039 |
- if len(arNames) == 2: |
|
2040 |
- bFamilyNameFirst = (arNames[0].endswith(',') or |
|
2041 |
- len(arNames[1]) == 1 or |
|
2042 |
- ((len(arNames[1]) == 2) and (arNames[1].endswith('.')))) |
|
2043 |
- if bFamilyNameFirst: |
|
2044 |
- arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) |
|
2045 |
- else: |
|
2046 |
- arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) |
|
2047 |
- |
|
2048 |
- # SORT-STRING |
|
2049 |
- sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) |
|
2050 |
- if sSortString: |
|
2051 |
- arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) |
|
2052 |
- |
|
2053 |
- # NICKNAME |
|
2054 |
- arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) |
|
2055 |
- if arNickname: |
|
2056 |
- arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) |
|
2057 |
- |
|
2058 |
- # PHOTO |
|
2059 |
- processSingleURI('photo') |
|
2060 |
- |
|
2061 |
- # BDAY |
|
2062 |
- dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) |
|
2063 |
- if dtBday: |
|
2064 |
- arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) |
|
2065 |
- |
|
2066 |
- # ADR (address) |
|
2067 |
- arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) |
|
2068 |
- for elmAdr in arAdr: |
|
2069 |
- arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) |
|
2070 |
- if not arType: |
|
2071 |
- arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 |
|
2072 |
- sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) |
|
2073 |
- sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) |
|
2074 |
- sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) |
|
2075 |
- sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) |
|
2076 |
- sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) |
|
2077 |
- sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1) |
|
2078 |
- sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) |
|
2079 |
- arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' + |
|
2080 |
- sPostOfficeBox + ';' + |
|
2081 |
- sExtendedAddress + ';' + |
|
2082 |
- sStreetAddress + ';' + |
|
2083 |
- sLocality + ';' + |
|
2084 |
- sRegion + ';' + |
|
2085 |
- sPostalCode + ';' + |
|
2086 |
- sCountryName)) |
|
2087 |
- |
|
2088 |
- # LABEL |
|
2089 |
- processTypeValue('label', ['intl','postal','parcel','work']) |
|
2090 |
- |
|
2091 |
- # TEL (phone number) |
|
2092 |
- processTypeValue('tel', ['voice']) |
|
2093 |
- |
|
2094 |
- # EMAIL |
|
2095 |
- processTypeValue('email', ['internet'], ['internet']) |
|
2096 |
- |
|
2097 |
- # MAILER |
|
2098 |
- processSingleString('mailer') |
|
2099 |
- |
|
2100 |
- # TZ (timezone) |
|
2101 |
- processSingleString('tz') |
|
2102 |
- |
|
2103 |
- # GEO (geographical information) |
|
2104 |
- elmGeo = self.getPropertyValue(elmCard, 'geo') |
|
2105 |
- if elmGeo: |
|
2106 |
- sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) |
|
2107 |
- sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) |
|
2108 |
- arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) |
|
2109 |
- |
|
2110 |
- # TITLE |
|
2111 |
- processSingleString('title') |
|
2112 |
- |
|
2113 |
- # ROLE |
|
2114 |
- processSingleString('role') |
|
2115 |
- |
|
2116 |
- # LOGO |
|
2117 |
- processSingleURI('logo') |
|
2118 |
- |
|
2119 |
- # ORG (organization) |
|
2120 |
- elmOrg = self.getPropertyValue(elmCard, 'org') |
|
2121 |
- if elmOrg: |
|
2122 |
- sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) |
|
2123 |
- if not sOrganizationName: |
|
2124 |
- # implied "organization-name" optimization |
|
2125 |
- # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization |
|
2126 |
- sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) |
|
2127 |
- if sOrganizationName: |
|
2128 |
- arLines.append(self.vcardFold('ORG:' + sOrganizationName)) |
|
2129 |
- else: |
|
2130 |
- arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) |
|
2131 |
- arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) |
|
2132 |
- |
|
2133 |
- # CATEGORY |
|
2134 |
- arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) |
|
2135 |
- if arCategory: |
|
2136 |
- arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) |
|
2137 |
- |
|
2138 |
- # NOTE |
|
2139 |
- processSingleString('note') |
|
2140 |
- |
|
2141 |
- # REV |
|
2142 |
- processSingleString('rev') |
|
2143 |
- |
|
2144 |
- # SOUND |
|
2145 |
- processSingleURI('sound') |
|
2146 |
- |
|
2147 |
- # UID |
|
2148 |
- processSingleString('uid') |
|
2149 |
- |
|
2150 |
- # URL |
|
2151 |
- processSingleURI('url') |
|
2152 |
- |
|
2153 |
- # CLASS |
|
2154 |
- processSingleString('class') |
|
2155 |
- |
|
2156 |
- # KEY |
|
2157 |
- processSingleURI('key') |
|
2158 |
- |
|
2159 |
- if arLines: |
|
2160 |
- arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard'] |
|
2161 |
- sVCards += '\n'.join(arLines) + '\n' |
|
2162 |
- |
|
2163 |
- return sVCards.strip() |
|
2164 |
- |
|
2165 |
- def isProbablyDownloadable(self, elm): |
|
2166 |
- attrsD = elm.attrMap |
|
2167 |
- if not attrsD.has_key('href'): return 0 |
|
2168 |
- linktype = attrsD.get('type', '').strip() |
|
2169 |
- if linktype.startswith('audio/') or \ |
|
2170 |
- linktype.startswith('video/') or \ |
|
2171 |
- (linktype.startswith('application/') and not linktype.endswith('xml')): |
|
2172 |
- return 1 |
|
2173 |
- path = urlparse.urlparse(attrsD['href'])[2] |
|
2174 |
- if path.find('.') == -1: return 0 |
|
2175 |
- fileext = path.split('.').pop().lower() |
|
2176 |
- return fileext in self.known_binary_extensions |
|
2177 |
- |
|
2178 |
- def findTags(self): |
|
2179 |
- all = lambda x: 1 |
|
2180 |
- for elm in self.document(all, {'rel': re.compile(r'\btag\b')}): |
|
2181 |
- href = elm.get('href') |
|
2182 |
- if not href: continue |
|
2183 |
- urlscheme, domain, path, params, query, fragment = \ |
|
2184 |
- urlparse.urlparse(_urljoin(self.baseuri, href)) |
|
2185 |
- segments = path.split('/') |
|
2186 |
- tag = segments.pop() |
|
2187 |
- if not tag: |
|
2188 |
- tag = segments.pop() |
|
2189 |
- tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', '')) |
|
2190 |
- if not tagscheme.endswith('/'): |
|
2191 |
- tagscheme += '/' |
|
2192 |
- self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''})) |
|
2193 |
- |
|
2194 |
- def findEnclosures(self): |
|
2195 |
- all = lambda x: 1 |
|
2196 |
- enclosure_match = re.compile(r'\benclosure\b') |
|
2197 |
- for elm in self.document(all, {'href': re.compile(r'.+')}): |
|
2198 |
- if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue |
|
2199 |
- if elm.attrMap not in self.enclosures: |
|
2200 |
- self.enclosures.append(elm.attrMap) |
|
2201 |
- if elm.string and not elm.get('title'): |
|
2202 |
- self.enclosures[-1]['title'] = elm.string |
|
2203 |
- |
|
2204 |
- def findXFN(self): |
|
2205 |
- all = lambda x: 1 |
|
2206 |
- for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}): |
|
2207 |
- rels = elm.get('rel', '').split() |
|
2208 |
- xfn_rels = [] |
|
2209 |
- for rel in rels: |
|
2210 |
- if rel in self.known_xfn_relationships: |
|
2211 |
- xfn_rels.append(rel) |
|
2212 |
- if xfn_rels: |
|
2213 |
- self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string}) |
|
2214 |
- |
|
2215 |
-def _parseMicroformats(htmlSource, baseURI, encoding): |
|
2216 |
- if not BeautifulSoup: return |
|
2217 |
- if _debug: sys.stderr.write('entering _parseMicroformats\n') |
|
2218 |
- p = _MicroformatsParser(htmlSource, baseURI, encoding) |
|
2219 |
- p.vcard = p.findVCards(p.document) |
|
2220 |
- p.findTags() |
|
2221 |
- p.findEnclosures() |
|
2222 |
- p.findXFN() |
|
2223 |
- return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard} |
|
2224 |
- |
|
2225 |
-class _RelativeURIResolver(_BaseHTMLProcessor): |
|
2226 |
- relative_uris = [('a', 'href'), |
|
2227 |
- ('applet', 'codebase'), |
|
2228 |
- ('area', 'href'), |
|
2229 |
- ('blockquote', 'cite'), |
|
2230 |
- ('body', 'background'), |
|
2231 |
- ('del', 'cite'), |
|
2232 |
- ('form', 'action'), |
|
2233 |
- ('frame', 'longdesc'), |
|
2234 |
- ('frame', 'src'), |
|
2235 |
- ('iframe', 'longdesc'), |
|
2236 |
- ('iframe', 'src'), |
|
2237 |
- ('head', 'profile'), |
|
2238 |
- ('img', 'longdesc'), |
|
2239 |
- ('img', 'src'), |
|
2240 |
- ('img', 'usemap'), |
|
2241 |
- ('input', 'src'), |
|
2242 |
- ('input', 'usemap'), |
|
2243 |
- ('ins', 'cite'), |
|
2244 |
- ('link', 'href'), |
|
2245 |
- ('object', 'classid'), |
|
2246 |
- ('object', 'codebase'), |
|
2247 |
- ('object', 'data'), |
|
2248 |
- ('object', 'usemap'), |
|
2249 |
- ('q', 'cite'), |
|
2250 |
- ('script', 'src')] |
|
2251 |
- |
|
2252 |
- def __init__(self, baseuri, encoding, type): |
|
2253 |
- _BaseHTMLProcessor.__init__(self, encoding, type) |
|
2254 |
- self.baseuri = baseuri |
|
2255 |
- |
|
2256 |
- def resolveURI(self, uri): |
|
2257 |
- return _urljoin(self.baseuri, uri.strip()) |
|
2258 |
- |
|
2259 |
- def unknown_starttag(self, tag, attrs): |
|
2260 |
- attrs = self.normalize_attrs(attrs) |
|
2261 |
- attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] |
|
2262 |
- _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) |
|
2263 |
- |
|
2264 |
-def _resolveRelativeURIs(htmlSource, baseURI, encoding, type): |
|
2265 |
- if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') |
|
2266 |
- p = _RelativeURIResolver(baseURI, encoding, type) |
|
2267 |
- p.feed(htmlSource) |
|
2268 |
- return p.output() |
|
2269 |
- |
|
2270 |
-class _HTMLSanitizer(_BaseHTMLProcessor): |
|
2271 |
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', |
|
2272 |
- 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', |
|
2273 |
- 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', |
|
2274 |
- 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', |
|
2275 |
- 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', |
|
2276 |
- 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', |
|
2277 |
- 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', |
|
2278 |
- 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', |
|
2279 |
- 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', |
|
2280 |
- 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', |
|
2281 |
- 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', |
|
2282 |
- 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] |
|
2283 |
- |
|
2284 |
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', |
|
2285 |
- 'action', 'align', 'alt', 'autoplay', 'autocomplete', 'autofocus', 'axis', |
|
2286 |
- 'background', 'balance', 'bgcolor', 'bgproperties', 'border', |
|
2287 |
- 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', |
|
2288 |
- 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', |
|
2289 |
- 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', |
|
2290 |
- 'colspan', 'compact', 'contenteditable', 'coords', 'data', 'datafld', |
|
2291 |
- 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', |
|
2292 |
- 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', |
|
2293 |
- 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', |
|
2294 |
- 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', |
|
2295 |
- 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', |
|
2296 |
- 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', |
|
2297 |
- 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', |
|
2298 |
- 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', |
|
2299 |
- 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', |
|
2300 |
- 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', |
|
2301 |
- 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', |
|
2302 |
- 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', |
|
2303 |
- 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', |
|
2304 |
- 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', |
|
2305 |
- 'xml:lang'] |
|
2306 |
- |
|
2307 |
- unacceptable_elements_with_end_tag = ['script', 'applet', 'style'] |
|
2308 |
- |
|
2309 |
- acceptable_css_properties = ['azimuth', 'background-color', |
|
2310 |
- 'border-bottom-color', 'border-collapse', 'border-color', |
|
2311 |
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear', |
|
2312 |
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', |
|
2313 |
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', |
|
2314 |
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', |
|
2315 |
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', |
|
2316 |
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', |
|
2317 |
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', |
|
2318 |
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', |
|
2319 |
- 'white-space', 'width'] |
|
2320 |
- |
|
2321 |
- # survey of common keywords found in feeds |
|
2322 |
- acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', |
|
2323 |
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', |
|
2324 |
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', |
|
2325 |
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', |
|
2326 |
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', |
|
2327 |
- 'transparent', 'underline', 'white', 'yellow'] |
|
2328 |
- |
|
2329 |
- valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + |
|
2330 |
- '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') |
|
2331 |
- |
|
2332 |
- mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math', |
|
2333 |
- 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', |
|
2334 |
- 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', |
|
2335 |
- 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', |
|
2336 |
- 'munderover', 'none', 'semantics'] |
|
2337 |
- |
|
2338 |
- mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', |
|
2339 |
- 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', |
|
2340 |
- 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', |
|
2341 |
- 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', |
|
2342 |
- 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', |
|
2343 |
- 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', |
|
2344 |
- 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', |
|
2345 |
- 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', |
|
2346 |
- 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'] |
|
2347 |
- |
|
2348 |
- # svgtiny - foreignObject + linearGradient + radialGradient + stop |
|
2349 |
- svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', |
|
2350 |
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', |
|
2351 |
- 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', |
|
2352 |
- 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', |
|
2353 |
- 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', |
|
2354 |
- 'svg', 'switch', 'text', 'title', 'tspan', 'use'] |
|
2355 |
- |
|
2356 |
- # svgtiny + class + opacity + offset + xmlns + xmlns:xlink |
|
2357 |
- svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', |
|
2358 |
- 'arabic-form', 'ascent', 'attributeName', 'attributeType', |
|
2359 |
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', |
|
2360 |
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', |
|
2361 |
- 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', |
|
2362 |
- 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', |
|
2363 |
- 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', |
|
2364 |
- 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', |
|
2365 |
- 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', |
|
2366 |
- 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', |
|
2367 |
- 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', |
|
2368 |
- 'min', 'name', 'offset', 'opacity', 'orient', 'origin', |
|
2369 |
- 'overline-position', 'overline-thickness', 'panose-1', 'path', |
|
2370 |
- 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', |
|
2371 |
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', |
|
2372 |
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', |
|
2373 |
- 'stop-color', 'stop-opacity', 'strikethrough-position', |
|
2374 |
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray', |
|
2375 |
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', |
|
2376 |
- 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', |
|
2377 |
- 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', |
|
2378 |
- 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', |
|
2379 |
- 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', |
|
2380 |
- 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', |
|
2381 |
- 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', |
|
2382 |
- 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', |
|
2383 |
- 'y2', 'zoomAndPan'] |
|
2384 |
- |
|
2385 |
- svg_attr_map = None |
|
2386 |
- svg_elem_map = None |
|
2387 |
- |
|
2388 |
- acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', |
|
2389 |
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', |
|
2390 |
- 'stroke-opacity'] |
|
2391 |
- |
|
2392 |
- def reset(self): |
|
2393 |
- _BaseHTMLProcessor.reset(self) |
|
2394 |
- self.unacceptablestack = 0 |
|
2395 |
- self.mathmlOK = 0 |
|
2396 |
- self.svgOK = 0 |
|
2397 |
- |
|
2398 |
- def unknown_starttag(self, tag, attrs): |
|
2399 |
- acceptable_attributes = self.acceptable_attributes |
|
2400 |
- keymap = {} |
|
2401 |
- if not tag in self.acceptable_elements or self.svgOK: |
|
2402 |
- if tag in self.unacceptable_elements_with_end_tag: |
|
2403 |
- self.unacceptablestack += 1 |
|
2404 |
- |
|
2405 |
- # not otherwise acceptable, perhaps it is MathML or SVG? |
|
2406 |
- if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: |
|
2407 |
- self.mathmlOK += 1 |
|
2408 |
- if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs: |
|
2409 |
- self.svgOK += 1 |
|
2410 |
- |
|
2411 |
- # choose acceptable attributes based on tag class, else bail
|
2412 |
- if self.mathmlOK and tag in self.mathml_elements: |
|
2413 |
- acceptable_attributes = self.mathml_attributes |
|
2414 |
- elif self.svgOK and tag in self.svg_elements: |
|
2415 |
- # for most vocabularies, lowercasing is a good idea. Many |
|
2416 |
- # svg elements, however, are camel case |
|
2417 |
- if not self.svg_attr_map: |
|
2418 |
- lower=[attr.lower() for attr in self.svg_attributes] |
|
2419 |
- mix=[a for a in self.svg_attributes if a not in lower] |
|
2420 |
- self.svg_attributes = lower |
|
2421 |
- self.svg_attr_map = dict([(a.lower(),a) for a in mix]) |
|
2422 |
- |
|
2423 |
- lower=[attr.lower() for attr in self.svg_elements] |
|
2424 |
- mix=[a for a in self.svg_elements if a not in lower] |
|
2425 |
- self.svg_elements = lower |
|
2426 |
- self.svg_elem_map = dict([(a.lower(),a) for a in mix]) |
|
2427 |
- acceptable_attributes = self.svg_attributes |
|
2428 |
- tag = self.svg_elem_map.get(tag,tag) |
|
2429 |
- keymap = self.svg_attr_map |
|
2430 |
- elif not tag in self.acceptable_elements: |
|
2431 |
- return |
|
2432 |
- |
|
2433 |
- # declare xlink namespace, if needed |
|
2434 |
- if self.mathmlOK or self.svgOK: |
|
2435 |
- if filter(lambda (n,v): n.startswith('xlink:'),attrs): |
|
2436 |
- if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: |
|
2437 |
- attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) |
|
2438 |
- |
|
2439 |
- clean_attrs = [] |
|
2440 |
- for key, value in self.normalize_attrs(attrs): |
|
2441 |
- if key in acceptable_attributes: |
|
2442 |
- key=keymap.get(key,key) |
|
2443 |
- clean_attrs.append((key,value)) |
|
2444 |
- elif key=='style': |
|
2445 |
- clean_value = self.sanitize_style(value) |
|
2446 |
- if clean_value: clean_attrs.append((key,clean_value)) |
|
2447 |
- _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) |
|
2448 |
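The camelCase bookkeeping above is the subtle part: the parser hands the sanitizer lowercased names, so the code lazily builds a map back to the mixed-case spellings SVG requires. A standalone sketch of that trick (the attribute list here is a small sample, not the whitelist above):

    # The parser lowercases tag/attribute names, but SVG needs mixed case
    # (viewBox, preserveAspectRatio, ...).  Build a map back once, reuse it.
    svg_attributes = ['width', 'height', 'viewBox', 'preserveAspectRatio']
    lower = [a.lower() for a in svg_attributes]
    mix = [a for a in svg_attributes if a not in lower]      # camelCase entries only
    svg_attr_map = dict((a.lower(), a) for a in mix)

    assert svg_attr_map['viewbox'] == 'viewBox'
    assert svg_attr_map.get('width', 'width') == 'width'     # plain names pass through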
- |
|
2449 |
- def unknown_endtag(self, tag): |
|
2450 |
- if not tag in self.acceptable_elements: |
|
2451 |
- if tag in self.unacceptable_elements_with_end_tag: |
|
2452 |
- self.unacceptablestack -= 1 |
|
2453 |
- if self.mathmlOK and tag in self.mathml_elements: |
|
2454 |
- if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1 |
|
2455 |
- elif self.svgOK and tag in self.svg_elements: |
|
2456 |
- tag = self.svg_elem_map.get(tag,tag) |
|
2457 |
- if tag == 'svg' and self.svgOK: self.svgOK -= 1 |
|
2458 |
- else: |
|
2459 |
- return |
|
2460 |
- _BaseHTMLProcessor.unknown_endtag(self, tag) |
|
2461 |
- |
|
2462 |
- def handle_pi(self, text): |
|
2463 |
- pass |
|
2464 |
- |
|
2465 |
- def handle_decl(self, text): |
|
2466 |
- pass |
|
2467 |
- |
|
2468 |
- def handle_data(self, text): |
|
2469 |
- if not self.unacceptablestack: |
|
2470 |
- _BaseHTMLProcessor.handle_data(self, text) |
|
2471 |
- |
|
2472 |
- def sanitize_style(self, style): |
|
2473 |
- # disallow urls |
|
2474 |
- style=re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
|
2475 |
- |
|
2476 |
- # gauntlet |
|
2477 |
- if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' |
|
2478 |
- if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' |
|
2479 |
- |
|
2480 |
- clean = [] |
|
2481 |
- for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): |
|
2482 |
- if not value: continue |
|
2483 |
- if prop.lower() in self.acceptable_css_properties: |
|
2484 |
- clean.append(prop + ': ' + value + ';') |
|
2485 |
- elif prop.split('-')[0].lower() in ['background','border','margin','padding']: |
|
2486 |
- for keyword in value.split(): |
|
2487 |
- if not keyword in self.acceptable_css_keywords and \ |
|
2488 |
- not self.valid_css_values.match(keyword): |
|
2489 |
- break |
|
2490 |
- else: |
|
2491 |
- clean.append(prop + ': ' + value + ';') |
|
2492 |
- elif self.svgOK and prop.lower() in self.acceptable_svg_properties: |
|
2493 |
- clean.append(prop + ': ' + value + ';') |
|
2494 |
- |
|
2495 |
- return ' '.join(clean) |
|
2496 |
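sanitize_style boils down to a regex gauntlet plus a property whitelist. A runnable reduction, where ACCEPTABLE is a tiny stand-in for the acceptable_css_properties list above:

    import re

    ACCEPTABLE = {'color', 'font-size', 'text-align'}   # stand-in whitelist

    def sanitize_style(style):
        # disallow url(...) values, then keep only whitelisted properties
        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)
        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if value and prop.lower() in ACCEPTABLE:
                clean.append('%s: %s;' % (prop, value))
        return ' '.join(clean)

    print(sanitize_style('color: red; position: fixed'))    # color: red;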
- |
|
2497 |
- |
|
2498 |
-def _sanitizeHTML(htmlSource, encoding, type): |
|
2499 |
- p = _HTMLSanitizer(encoding, type) |
|
2500 |
- p.feed(htmlSource) |
|
2501 |
- data = p.output() |
|
2502 |
- if TIDY_MARKUP: |
|
2503 |
- # loop through list of preferred Tidy interfaces looking for one that's installed, |
|
2504 |
- # then set up a common _tidy function to wrap the interface-specific API. |
|
2505 |
- _tidy = None |
|
2506 |
- for tidy_interface in PREFERRED_TIDY_INTERFACES: |
|
2507 |
- try: |
|
2508 |
- if tidy_interface == "uTidy": |
|
2509 |
- from tidy import parseString as _utidy |
|
2510 |
- def _tidy(data, **kwargs): |
|
2511 |
- return str(_utidy(data, **kwargs)) |
|
2512 |
- break |
|
2513 |
- elif tidy_interface == "mxTidy": |
|
2514 |
- from mx.Tidy import Tidy as _mxtidy |
|
2515 |
- def _tidy(data, **kwargs): |
|
2516 |
- nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) |
|
2517 |
- return data |
|
2518 |
- break |
|
2519 |
- except: |
|
2520 |
- pass |
|
2521 |
- if _tidy: |
|
2522 |
- utf8 = type(data) == type(u'') |
|
2523 |
- if utf8: |
|
2524 |
- data = data.encode('utf-8') |
|
2525 |
- data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") |
|
2526 |
- if utf8: |
|
2527 |
- data = unicode(data, 'utf-8') |
|
2528 |
- if data.count('<body'): |
|
2529 |
- data = data.split('<body', 1)[1] |
|
2530 |
- if data.count('>'): |
|
2531 |
- data = data.split('>', 1)[1] |
|
2532 |
- if data.count('</body'): |
|
2533 |
- data = data.split('</body', 1)[0] |
|
2534 |
- data = data.strip().replace('\r\n', '\n') |
|
2535 |
- return data |
|
2536 |
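The uTidy/mxTidy probing is a general "first importable backend wins" pattern. A self-contained sketch (the candidate list is illustrative, not the full PREFERRED_TIDY_INTERFACES handling):

    def pick_backend(candidates):
        # candidates: (name, zero-argument import thunk) pairs; the first
        # one that imports cleanly wins.
        for name, thunk in candidates:
            try:
                return name, thunk()
            except ImportError:
                continue
        return None, None

    def _import_utidy():
        from tidy import parseString as _utidy      # third-party package
        return lambda data, **kw: str(_utidy(data, **kw))

    name, tidy_fn = pick_backend([('uTidy', _import_utidy)])
    print(name)    # None unless the 'tidy' package is installed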
- |
|
2537 |
-class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): |
|
2538 |
- def http_error_default(self, req, fp, code, msg, headers): |
|
2539 |
- if ((code / 100) == 3) and (code != 304): |
|
2540 |
- return self.http_error_302(req, fp, code, msg, headers) |
|
2541 |
- infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2542 |
- infourl.status = code |
|
2543 |
- return infourl |
|
2544 |
- |
|
2545 |
- def http_error_302(self, req, fp, code, msg, headers): |
|
2546 |
- if headers.dict.has_key('location'): |
|
2547 |
- infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) |
|
2548 |
- else: |
|
2549 |
- infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2550 |
- if not hasattr(infourl, 'status'): |
|
2551 |
- infourl.status = code |
|
2552 |
- return infourl |
|
2553 |
- |
|
2554 |
- def http_error_301(self, req, fp, code, msg, headers): |
|
2555 |
- if headers.dict.has_key('location'): |
|
2556 |
- infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) |
|
2557 |
- else: |
|
2558 |
- infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
2559 |
- if not hasattr(infourl, 'status'): |
|
2560 |
- infourl.status = code |
|
2561 |
- return infourl |
|
2562 |
- |
|
2563 |
- http_error_300 = http_error_302 |
|
2564 |
- http_error_303 = http_error_302 |
|
2565 |
- http_error_307 = http_error_302 |
|
2566 |
- |
|
2567 |
- def http_error_401(self, req, fp, code, msg, headers): |
|
2568 |
- # Check if |
|
2569 |
- # - server requires digest auth, AND |
|
2570 |
- # - we tried (unsuccessfully) with basic auth, AND |
|
2571 |
- # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions) |
|
2572 |
- # If all conditions hold, parse authentication information |
|
2573 |
- # out of the Authorization header we sent the first time |
|
2574 |
- # (for the username and password) and the WWW-Authenticate |
|
2575 |
- # header the server sent back (for the realm) and retry |
|
2576 |
- # the request with the appropriate digest auth headers instead. |
|
2577 |
- # This evil genius hack has been brought to you by Aaron Swartz. |
|
2578 |
- host = urlparse.urlparse(req.get_full_url())[1] |
|
2579 |
- try: |
|
2580 |
- assert sys.version.split()[0] >= '2.3.3' |
|
2581 |
- assert base64 != None |
|
2582 |
- user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') |
|
2583 |
- realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] |
|
2584 |
- self.add_password(realm, host, user, passw) |
|
2585 |
- retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) |
|
2586 |
- self.reset_retry_count() |
|
2587 |
- return retry |
|
2588 |
- except: |
|
2589 |
- return self.http_error_default(req, fp, code, msg, headers) |
|
2590 |
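The handler mostly exists to keep the HTTP status attached to the returned file-like object. For comparison, a rough Python 3 sketch (urllib.request already follows redirects on its own; this is not a drop-in replacement):

    import urllib.request

    def fetch_with_status(url, agent='demo-agent/1.0'):
        # urllib.request follows 301/302/303/307 itself; resp.status is the
        # final status and resp.url the final (possibly redirected) address.
        req = urllib.request.Request(url, headers={'User-Agent': agent})
        with urllib.request.urlopen(req) as resp:
            return resp.status, resp.url, resp.read()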
- |
|
2591 |
-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): |
|
2592 |
- """URL, filename, or string --> stream |
|
2593 |
- |
|
2594 |
- This function lets you define parsers that take any input source |
|
2595 |
- (URL, pathname to local or network file, or actual data as a string) |
|
2596 |
- and deal with it in a uniform manner. Returned object is guaranteed |
|
2597 |
- to have all the basic stdio read methods (read, readline, readlines). |
|
2598 |
- Just .close() the object when you're done with it. |
|
2599 |
- |
|
2600 |
- If the etag argument is supplied, it will be used as the value of an |
|
2601 |
- If-None-Match request header. |
|
2602 |
- |
|
2603 |
- If the modified argument is supplied, it can be a tuple of 9 integers |
|
2604 |
- (as returned by gmtime() in the standard Python time module) or a date |
|
2605 |
- string in any format supported by feedparser. Regardless, it MUST |
|
2606 |
- be in GMT (Greenwich Mean Time). It will be reformatted into an |
|
2607 |
- RFC 1123-compliant date and used as the value of an If-Modified-Since |
|
2608 |
- request header. |
|
2609 |
- |
|
2610 |
- If the agent argument is supplied, it will be used as the value of a |
|
2611 |
- User-Agent request header. |
|
2612 |
- |
|
2613 |
- If the referrer argument is supplied, it will be used as the value of a |
|
2614 |
- Referer[sic] request header. |
|
2615 |
- |
|
2616 |
- If handlers is supplied, it is a list of handlers used to build a |
|
2617 |
- urllib2 opener. |
|
2618 |
- """ |
|
2619 |
- |
|
2620 |
- if hasattr(url_file_stream_or_string, 'read'): |
|
2621 |
- return url_file_stream_or_string |
|
2622 |
- |
|
2623 |
- if url_file_stream_or_string == '-': |
|
2624 |
- return sys.stdin |
|
2625 |
- |
|
2626 |
- if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): |
|
2627 |
- if not agent: |
|
2628 |
- agent = USER_AGENT |
|
2629 |
- # test for inline user:password for basic auth |
|
2630 |
- auth = None |
|
2631 |
- if base64: |
|
2632 |
- urltype, rest = urllib.splittype(url_file_stream_or_string) |
|
2633 |
- realhost, rest = urllib.splithost(rest) |
|
2634 |
- if realhost: |
|
2635 |
- user_passwd, realhost = urllib.splituser(realhost) |
|
2636 |
- if user_passwd: |
|
2637 |
- url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) |
|
2638 |
- auth = base64.encodestring(user_passwd).strip() |
|
2639 |
- |
|
2640 |
- # iri support |
|
2641 |
- try: |
|
2642 |
- if isinstance(url_file_stream_or_string,unicode): |
|
2643 |
- url_file_stream_or_string = url_file_stream_or_string.encode('idna') |
|
2644 |
- else: |
|
2645 |
- url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna') |
|
2646 |
- except: |
|
2647 |
- pass |
|
2648 |
- |
|
2649 |
- # try to open with urllib2 (to use optional headers) |
|
2650 |
- request = urllib2.Request(url_file_stream_or_string) |
|
2651 |
- request.add_header('User-Agent', agent) |
|
2652 |
- if etag: |
|
2653 |
- request.add_header('If-None-Match', etag) |
|
2654 |
- if type(modified) == type(''): |
|
2655 |
- modified = _parse_date(modified) |
|
2656 |
- if modified: |
|
2657 |
- # format into an RFC 1123-compliant timestamp. We can't use |
|
2658 |
- # time.strftime() since the %a and %b directives can be affected |
|
2659 |
- # by the current locale, but RFC 2616 states that dates must be |
|
2660 |
- # in English. |
|
2661 |
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] |
|
2662 |
- months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
2663 |
- request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) |
|
2664 |
- if referrer: |
|
2665 |
- request.add_header('Referer', referrer) |
|
2666 |
- if gzip and zlib: |
|
2667 |
- request.add_header('Accept-encoding', 'gzip, deflate') |
|
2668 |
- elif gzip: |
|
2669 |
- request.add_header('Accept-encoding', 'gzip') |
|
2670 |
- elif zlib: |
|
2671 |
- request.add_header('Accept-encoding', 'deflate') |
|
2672 |
- else: |
|
2673 |
- request.add_header('Accept-encoding', '') |
|
2674 |
- if auth: |
|
2675 |
- request.add_header('Authorization', 'Basic %s' % auth) |
|
2676 |
- if ACCEPT_HEADER: |
|
2677 |
- request.add_header('Accept', ACCEPT_HEADER) |
|
2678 |
- request.add_header('A-IM', 'feed') # RFC 3229 support |
|
2679 |
- opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) |
|
2680 |
- opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent |
|
2681 |
- try: |
|
2682 |
- return opener.open(request) |
|
2683 |
- finally: |
|
2684 |
- opener.close() # JohnD |
|
2685 |
- |
|
2686 |
- # try to open with native open function (if url_file_stream_or_string is a filename) |
|
2687 |
- try: |
|
2688 |
- return open(url_file_stream_or_string) |
|
2689 |
- except: |
|
2690 |
- pass |
|
2691 |
- |
|
2692 |
- # treat url_file_stream_or_string as string |
|
2693 |
- return _StringIO(str(url_file_stream_or_string)) |
|
2694 |
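The If-Modified-Since branch's locale warning deserves emphasis: strftime('%a'/'%b') follows the current locale, so the code spells out English names by hand. That helper, extracted into a runnable snippet:

    import time

    WDAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    def rfc1123(tm=None):
        # RFC 2616 requires English day/month names, hence the tables
        # instead of locale-dependent time.strftime codes.
        tm = tm or time.gmtime()
        return '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
            WDAYS[tm[6]], tm[2], MONTHS[tm[1] - 1], tm[0], tm[3], tm[4], tm[5])

    print(rfc1123(time.gmtime(0)))    # Thu, 01 Jan 1970 00:00:00 GMT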
- |
|
2695 |
-_date_handlers = [] |
|
2696 |
-def registerDateHandler(func): |
|
2697 |
- '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' |
|
2698 |
- _date_handlers.insert(0, func) |
|
2699 |
- |
|
2700 |
-# ISO-8601 date parsing routines written by Fazal Majid. |
|
2701 |
-# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 |
|
2702 |
-# parser is beyond the scope of feedparser and would be a worthwhile addition |
|
2703 |
-# to the Python library. |
|
2704 |
-# A single regular expression cannot parse ISO 8601 date formats into groups |
|
2705 |
-# as the standard is highly irregular (for instance, is 030104 the date 2003-01-04 or
|
2706 |
-# 0301-04-01), so we use templates instead. |
|
2707 |
-# Please note the order in templates is significant because we need a |
|
2708 |
-# greedy match. |
|
2709 |
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', |
|
2710 |
- 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', |
|
2711 |
- '-YY-?MM', '-OOO', '-YY', |
|
2712 |
- '--MM-?DD', '--MM', |
|
2713 |
- '---DD', |
|
2714 |
- 'CC', ''] |
|
2715 |
-_iso8601_re = [ |
|
2716 |
- tmpl.replace( |
|
2717 |
- 'YYYY', r'(?P<year>\d{4})').replace( |
|
2718 |
- 'YY', r'(?P<year>\d\d)').replace( |
|
2719 |
- 'MM', r'(?P<month>[01]\d)').replace( |
|
2720 |
- 'DD', r'(?P<day>[0123]\d)').replace( |
|
2721 |
- 'OOO', r'(?P<ordinal>[0123]\d\d)').replace( |
|
2722 |
- 'CC', r'(?P<century>\d\d$)') |
|
2723 |
- + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})' |
|
2724 |
- + r'(:(?P<second>\d{2}(\.\d*)?))?' |
|
2725 |
- + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' |
|
2726 |
- for tmpl in _iso8601_tmpl] |
|
2727 |
-del tmpl |
|
2728 |
-_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] |
|
2729 |
-del regex |
|
2730 |
-def _parse_date_iso8601(dateString): |
|
2731 |
- '''Parse a variety of ISO-8601-compatible formats like 20040105''' |
|
2732 |
- m = None |
|
2733 |
- for _iso8601_match in _iso8601_matches: |
|
2734 |
- m = _iso8601_match(dateString) |
|
2735 |
- if m: break |
|
2736 |
- if not m: return |
|
2737 |
- if m.span() == (0, 0): return |
|
2738 |
- params = m.groupdict() |
|
2739 |
- ordinal = params.get('ordinal', 0) |
|
2740 |
- if ordinal: |
|
2741 |
- ordinal = int(ordinal) |
|
2742 |
- else: |
|
2743 |
- ordinal = 0 |
|
2744 |
- year = params.get('year', '--') |
|
2745 |
- if not year or year == '--': |
|
2746 |
- year = time.gmtime()[0] |
|
2747 |
- elif len(year) == 2: |
|
2748 |
- # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 |
|
2749 |
- year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
2750 |
- else: |
|
2751 |
- year = int(year) |
|
2752 |
- month = params.get('month', '-') |
|
2753 |
- if not month or month == '-': |
|
2754 |
- # ordinals are NOT normalized by mktime, we simulate them |
|
2755 |
- # by setting month=1, day=ordinal |
|
2756 |
- if ordinal: |
|
2757 |
- month = 1 |
|
2758 |
- else: |
|
2759 |
- month = time.gmtime()[1] |
|
2760 |
- month = int(month) |
|
2761 |
- day = params.get('day', 0) |
|
2762 |
- if not day: |
|
2763 |
- # see above |
|
2764 |
- if ordinal: |
|
2765 |
- day = ordinal |
|
2766 |
- elif params.get('century', 0) or \ |
|
2767 |
- params.get('year', 0) or params.get('month', 0): |
|
2768 |
- day = 1 |
|
2769 |
- else: |
|
2770 |
- day = time.gmtime()[2] |
|
2771 |
- else: |
|
2772 |
- day = int(day) |
|
2773 |
- # special case of the century - is the first year of the 21st century |
|
2774 |
- # 2000 or 2001 ? The debate goes on... |
|
2775 |
- if 'century' in params.keys(): |
|
2776 |
- year = (int(params['century']) - 1) * 100 + 1 |
|
2777 |
- # in ISO 8601 most fields are optional |
|
2778 |
- for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: |
|
2779 |
- if not params.get(field, None): |
|
2780 |
- params[field] = 0 |
|
2781 |
- hour = int(params.get('hour', 0)) |
|
2782 |
- minute = int(params.get('minute', 0)) |
|
2783 |
- second = int(float(params.get('second', 0))) |
|
2784 |
- # weekday is normalized by mktime(), we can ignore it |
|
2785 |
- weekday = 0 |
|
2786 |
- daylight_savings_flag = -1 |
|
2787 |
- tm = [year, month, day, hour, minute, second, weekday, |
|
2788 |
- ordinal, daylight_savings_flag] |
|
2789 |
- # ISO 8601 time zone adjustments |
|
2790 |
- tz = params.get('tz') |
|
2791 |
- if tz and tz != 'Z': |
|
2792 |
- if tz[0] == '-': |
|
2793 |
- tm[3] += int(params.get('tzhour', 0)) |
|
2794 |
- tm[4] += int(params.get('tzmin', 0)) |
|
2795 |
- elif tz[0] == '+': |
|
2796 |
- tm[3] -= int(params.get('tzhour', 0)) |
|
2797 |
- tm[4] -= int(params.get('tzmin', 0)) |
|
2798 |
- else: |
|
2799 |
- return None |
|
2800 |
- # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) |
|
2801 |
- # which is guaranteed to normalize d/m/y/h/m/s. |
|
2802 |
- # Many implementations have bugs, but we'll pretend they don't. |
|
2803 |
- return time.localtime(time.mktime(tm)) |
|
2804 |
-registerDateHandler(_parse_date_iso8601) |
|
2805 |
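The template expansion is the heart of this parser. In miniature, with three templates instead of fifteen (order still matters: longest first, for the greedy match the comment above calls out):

    import re

    tmpls = ['YYYY-MM-DD', 'YYYY-MM', 'YYYY']

    def expand(t):
        return (t.replace('YYYY', r'(?P<year>\d{4})')
                 .replace('MM', r'(?P<month>[01]\d)')
                 .replace('DD', r'(?P<day>[0123]\d)') + '$')

    matchers = [re.compile(expand(t)).match for t in tmpls]
    for s in ('2004-01-05', '2004-01', '2004'):
        m = next(m for m in (f(s) for f in matchers) if m)
        print(s, '->', m.groupdict())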
- |
|
2806 |
-# 8-bit date handling routines written by ytrewq1. |
|
2807 |
-_korean_year = u'\ub144' # b3e2 in euc-kr |
|
2808 |
-_korean_month = u'\uc6d4' # bff9 in euc-kr |
|
2809 |
-_korean_day = u'\uc77c' # c0cf in euc-kr |
|
2810 |
-_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr |
|
2811 |
-_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr |
|
2812 |
- |
|
2813 |
-_korean_onblog_date_re = \ |
|
2814 |
- re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ |
|
2815 |
- (_korean_year, _korean_month, _korean_day)) |
|
2816 |
-_korean_nate_date_re = \ |
|
2817 |
- re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ |
|
2818 |
- (_korean_am, _korean_pm)) |
|
2819 |
-def _parse_date_onblog(dateString): |
|
2820 |
- '''Parse a string according to the OnBlog 8-bit date format''' |
|
2821 |
- m = _korean_onblog_date_re.match(dateString) |
|
2822 |
- if not m: return |
|
2823 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2824 |
- {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2825 |
- 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
2826 |
- 'zonediff': '+09:00'} |
|
2827 |
- if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) |
|
2828 |
- return _parse_date_w3dtf(w3dtfdate) |
|
2829 |
-registerDateHandler(_parse_date_onblog) |
|
2830 |
- |
|
2831 |
-def _parse_date_nate(dateString): |
|
2832 |
- '''Parse a string according to the Nate 8-bit date format''' |
|
2833 |
- m = _korean_nate_date_re.match(dateString) |
|
2834 |
- if not m: return |
|
2835 |
- hour = int(m.group(5)) |
|
2836 |
- ampm = m.group(4) |
|
2837 |
- if (ampm == _korean_pm): |
|
2838 |
- hour += 12 |
|
2839 |
- hour = str(hour) |
|
2840 |
- if len(hour) == 1: |
|
2841 |
- hour = '0' + hour |
|
2842 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2843 |
- {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2844 |
- 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ |
|
2845 |
- 'zonediff': '+09:00'} |
|
2846 |
- if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) |
|
2847 |
- return _parse_date_w3dtf(w3dtfdate) |
|
2848 |
-registerDateHandler(_parse_date_nate) |
|
2849 |
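Both 8-bit handlers normalize a local format to W3DTF and delegate. A toy Python 3 analogue of the Nate case, leaning on datetime.fromisoformat instead of _parse_date_w3dtf (a sketch, not the module's code):

    import re
    from datetime import datetime

    KOREAN_PM = '\uc624\ud6c4'   # same marker as _korean_pm above

    def parse_nate_like(s):
        m = re.match(r'(\d{4})-(\d{2})-(\d{2})\s+(\S+)\s+(\d{1,2}):(\d{2}):(\d{2})', s)
        if not m:
            return None
        y, mo, d, ampm, h, mi, sec = m.groups()
        h = int(h) % 12 + (12 if ampm == KOREAN_PM else 0)   # 12-hour -> 24-hour
        return datetime.fromisoformat('%s-%s-%sT%02d:%s:%s+09:00' % (y, mo, d, h, mi, sec))

    print(parse_nate_like('2004-05-31 \uc624\ud6c4 10:01:02'))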
- |
|
2850 |
-_mssql_date_re = \ |
|
2851 |
- re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') |
|
2852 |
-def _parse_date_mssql(dateString): |
|
2853 |
- '''Parse a string according to the MS SQL date format''' |
|
2854 |
- m = _mssql_date_re.match(dateString) |
|
2855 |
- if not m: return |
|
2856 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
2857 |
- {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|
2858 |
- 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
2859 |
- 'zonediff': '+09:00'} |
|
2860 |
- if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) |
|
2861 |
- return _parse_date_w3dtf(w3dtfdate) |
|
2862 |
-registerDateHandler(_parse_date_mssql) |
|
2863 |
- |
|
2864 |
-# Unicode strings for Greek date strings |
|
2865 |
-_greek_months = \ |
|
2866 |
- { \ |
|
2867 |
- u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 |
|
2868 |
- u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 |
|
2869 |
- u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 |
|
2870 |
- u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 |
|
2871 |
- u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 |
|
2872 |
- u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 |
|
2873 |
- u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 |
|
2874 |
- u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 |
|
2875 |
- u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 |
|
2876 |
- u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 |
|
2877 |
- u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 |
|
2878 |
- u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 |
|
2879 |
- u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 |
|
2880 |
- u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 |
|
2881 |
- u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 |
|
2882 |
- u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 |
|
2883 |
- u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 |
|
2884 |
- u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 |
|
2885 |
- u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 |
|
2886 |
- } |
|
2887 |
- |
|
2888 |
-_greek_wdays = \ |
|
2889 |
- { \ |
|
2890 |
- u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 |
|
2891 |
- u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 |
|
2892 |
- u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 |
|
2893 |
- u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 |
|
2894 |
- u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 |
|
2895 |
- u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 |
|
2896 |
- u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 |
|
2897 |
- } |
|
2898 |
- |
|
2899 |
-_greek_date_format_re = \ |
|
2900 |
- re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') |
|
2901 |
- |
|
2902 |
-def _parse_date_greek(dateString): |
|
2903 |
- '''Parse a string according to a Greek 8-bit date format.''' |
|
2904 |
- m = _greek_date_format_re.match(dateString) |
|
2905 |
- if not m: return |
|
2906 |
- try: |
|
2907 |
- wday = _greek_wdays[m.group(1)] |
|
2908 |
- month = _greek_months[m.group(3)] |
|
2909 |
- except: |
|
2910 |
- return |
|
2911 |
- rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ |
|
2912 |
- {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ |
|
2913 |
- 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ |
|
2914 |
- 'zonediff': m.group(8)} |
|
2915 |
- if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) |
|
2916 |
- return _parse_date_rfc822(rfc822date) |
|
2917 |
-registerDateHandler(_parse_date_greek) |
|
2918 |
- |
|
2919 |
-# Unicode strings for Hungarian date strings |
|
2920 |
-_hungarian_months = \ |
|
2921 |
- { \ |
|
2922 |
- u'janu\u00e1r': u'01', # e1 in iso-8859-2 |
|
2923 |
- u'febru\u00e1ri': u'02', # e1 in iso-8859-2 |
|
2924 |
- u'm\u00e1rcius': u'03', # e1 in iso-8859-2 |
|
2925 |
- u'\u00e1prilis': u'04', # e1 in iso-8859-2 |
|
2926 |
- u'm\u00e1ujus': u'05', # e1 in iso-8859-2 |
|
2927 |
- u'j\u00fanius': u'06', # fa in iso-8859-2 |
|
2928 |
- u'j\u00falius': u'07', # fa in iso-8859-2 |
|
2929 |
- u'augusztus': u'08', |
|
2930 |
- u'szeptember': u'09', |
|
2931 |
- u'okt\u00f3ber': u'10', # f3 in iso-8859-2 |
|
2932 |
- u'november': u'11', |
|
2933 |
- u'december': u'12', |
|
2934 |
- } |
|
2935 |
- |
|
2936 |
-_hungarian_date_format_re = \ |
|
2937 |
- re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') |
|
2938 |
- |
|
2939 |
-def _parse_date_hungarian(dateString): |
|
2940 |
- '''Parse a string according to a Hungarian 8-bit date format.''' |
|
2941 |
- m = _hungarian_date_format_re.match(dateString) |
|
2942 |
- if not m: return |
|
2943 |
- try: |
|
2944 |
- month = _hungarian_months[m.group(2)] |
|
2945 |
- day = m.group(3) |
|
2946 |
- if len(day) == 1: |
|
2947 |
- day = '0' + day |
|
2948 |
- hour = m.group(4) |
|
2949 |
- if len(hour) == 1: |
|
2950 |
- hour = '0' + hour |
|
2951 |
- except: |
|
2952 |
- return |
|
2953 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ |
|
2954 |
- {'year': m.group(1), 'month': month, 'day': day,\ |
|
2955 |
- 'hour': hour, 'minute': m.group(5),\ |
|
2956 |
- 'zonediff': m.group(6)} |
|
2957 |
- if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) |
|
2958 |
- return _parse_date_w3dtf(w3dtfdate) |
|
2959 |
-registerDateHandler(_parse_date_hungarian) |
|
2960 |
- |
|
2961 |
-# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by |
|
2962 |
-# Drake and licensed under the Python license. Removed all range checking |
|
2963 |
-# for month, day, hour, minute, and second, since mktime will normalize |
|
2964 |
-# these later |
|
2965 |
-def _parse_date_w3dtf(dateString): |
|
2966 |
- def __extract_date(m): |
|
2967 |
- year = int(m.group('year')) |
|
2968 |
- if year < 100: |
|
2969 |
- year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
2970 |
- if year < 1000: |
|
2971 |
- return 0, 0, 0 |
|
2972 |
- julian = m.group('julian') |
|
2973 |
- if julian: |
|
2974 |
- julian = int(julian) |
|
2975 |
- month = julian / 30 + 1 |
|
2976 |
- day = julian % 30 + 1 |
|
2977 |
- jday = None |
|
2978 |
- while jday != julian: |
|
2979 |
- t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) |
|
2980 |
- jday = time.gmtime(t)[-2] |
|
2981 |
- diff = abs(jday - julian) |
|
2982 |
- if jday > julian: |
|
2983 |
- if diff < day: |
|
2984 |
- day = day - diff |
|
2985 |
- else: |
|
2986 |
- month = month - 1 |
|
2987 |
- day = 31 |
|
2988 |
- elif jday < julian: |
|
2989 |
- if day + diff < 28: |
|
2990 |
- day = day + diff |
|
2991 |
- else: |
|
2992 |
- month = month + 1 |
|
2993 |
- return year, month, day |
|
2994 |
- month = m.group('month') |
|
2995 |
- day = 1 |
|
2996 |
- if month is None: |
|
2997 |
- month = 1 |
|
2998 |
- else: |
|
2999 |
- month = int(month) |
|
3000 |
- day = m.group('day') |
|
3001 |
- if day: |
|
3002 |
- day = int(day) |
|
3003 |
- else: |
|
3004 |
- day = 1 |
|
3005 |
- return year, month, day |
|
3006 |
- |
|
3007 |
- def __extract_time(m): |
|
3008 |
- if not m: |
|
3009 |
- return 0, 0, 0 |
|
3010 |
- hours = m.group('hours') |
|
3011 |
- if not hours: |
|
3012 |
- return 0, 0, 0 |
|
3013 |
- hours = int(hours) |
|
3014 |
- minutes = int(m.group('minutes')) |
|
3015 |
- seconds = m.group('seconds') |
|
3016 |
- if seconds: |
|
3017 |
- seconds = int(seconds) |
|
3018 |
- else: |
|
3019 |
- seconds = 0 |
|
3020 |
- return hours, minutes, seconds |
|
3021 |
- |
|
3022 |
- def __extract_tzd(m): |
|
3023 |
- '''Return the Time Zone Designator as an offset in seconds from UTC.''' |
|
3024 |
- if not m: |
|
3025 |
- return 0 |
|
3026 |
- tzd = m.group('tzd') |
|
3027 |
- if not tzd: |
|
3028 |
- return 0 |
|
3029 |
- if tzd == 'Z': |
|
3030 |
- return 0 |
|
3031 |
- hours = int(m.group('tzdhours')) |
|
3032 |
- minutes = m.group('tzdminutes') |
|
3033 |
- if minutes: |
|
3034 |
- minutes = int(minutes) |
|
3035 |
- else: |
|
3036 |
- minutes = 0 |
|
3037 |
- offset = (hours*60 + minutes) * 60 |
|
3038 |
- if tzd[0] == '+': |
|
3039 |
- return -offset |
|
3040 |
- return offset |
|
3041 |
- |
|
3042 |
- __date_re = ('(?P<year>\d\d\d\d)' |
|
3043 |
- '(?:(?P<dsep>-|)' |
|
3044 |
- '(?:(?P<julian>\d\d\d)' |
|
3045 |
- '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?') |
|
3046 |
- __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)' |
|
3047 |
- __tzd_rx = re.compile(__tzd_re) |
|
3048 |
- __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)' |
|
3049 |
- '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?' |
|
3050 |
- + __tzd_re) |
|
3051 |
- __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) |
|
3052 |
- __datetime_rx = re.compile(__datetime_re) |
|
3053 |
- m = __datetime_rx.match(dateString) |
|
3054 |
- if (m is None) or (m.group() != dateString): return |
|
3055 |
- gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) |
|
3056 |
- if gmt[0] == 0: return |
|
3057 |
- return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) |
|
3058 |
-registerDateHandler(_parse_date_w3dtf) |
|
3059 |
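__extract_date resolves an ordinal ("julian") day by iterating mktime until it converges; the standard library can compute it directly, which makes a handy cross-check (a sketch, not the parser's code):

    from datetime import date, timedelta

    def from_ordinal_day(year, ordinal):
        # day-of-year -> calendar date, no mktime iteration needed
        return date(year, 1, 1) + timedelta(days=ordinal - 1)

    print(from_ordinal_day(2003, 35))    # 2003-02-04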
- |
|
3060 |
-def _parse_date_rfc822(dateString): |
|
3061 |
- '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' |
|
3062 |
- data = dateString.split() |
|
3063 |
- if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: |
|
3064 |
- del data[0] |
|
3065 |
- if len(data) == 4: |
|
3066 |
- s = data[3] |
|
3067 |
- i = s.find('+') |
|
3068 |
- if i > 0: |
|
3069 |
- data[3:] = [s[:i], s[i+1:]] |
|
3070 |
- else: |
|
3071 |
- data.append('') |
|
3072 |
- dateString = " ".join(data) |
|
3073 |
- if len(data) < 5: |
|
3074 |
- dateString += ' 00:00:00 GMT' |
|
3075 |
- tm = rfc822.parsedate_tz(dateString) |
|
3076 |
- if tm: |
|
3077 |
- return time.gmtime(rfc822.mktime_tz(tm)) |
|
3078 |
-# rfc822.py defines several time zones, but we define some extra ones. |
|
3079 |
-# 'ET' is equivalent to 'EST', etc. |
|
3080 |
-_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} |
|
3081 |
-rfc822._timezones.update(_additional_timezones) |
|
3082 |
-registerDateHandler(_parse_date_rfc822) |
|
3083 |
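_additional_timezones patches extra zone names into the rfc822 module. A Python 3 sketch of the same idea on top of email.utils, rfc822's replacement (the EXTRA table mirrors the one above):

    import email.utils

    # email.utils.parsedate_tz knows EST/EDT etc. but not the bare 'ET'
    # family, so rewrite those before parsing and compensate afterwards.
    EXTRA = {'AT': -4 * 3600, 'ET': -5 * 3600, 'CT': -6 * 3600,
             'MT': -7 * 3600, 'PT': -8 * 3600}

    def parse_rfc822(s):
        head, _, zone = s.rpartition(' ')
        shift = EXTRA.get(zone, 0)
        if shift:
            s = head + ' GMT'
        tm = email.utils.parsedate_tz(s)
        return email.utils.mktime_tz(tm) - shift if tm else None

    print(parse_rfc822('Thu, 01 Jan 1970 00:00:00 PT'))    # 28800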
- |
|
3084 |
-def _parse_date_perforce(aDateString): |
|
3085 |
- """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" |
|
3086 |
- # Fri, 2006/09/15 08:19:53 EDT |
|
3087 |
- _my_date_pattern = re.compile( \ |
|
3088 |
- r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') |
|
3089 |
- |
|
3090 |
- dow, year, month, day, hour, minute, second, tz = \ |
|
3091 |
- _my_date_pattern.search(aDateString).groups() |
|
3092 |
- months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
3093 |
- dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) |
|
3094 |
- tm = rfc822.parsedate_tz(dateString) |
|
3095 |
- if tm: |
|
3096 |
- return time.gmtime(rfc822.mktime_tz(tm)) |
|
3097 |
-registerDateHandler(_parse_date_perforce) |
|
3098 |
- |
|
3099 |
-def _parse_date(dateString): |
|
3100 |
- '''Parses a variety of date formats into a 9-tuple in GMT''' |
|
3101 |
- for handler in _date_handlers: |
|
3102 |
- try: |
|
3103 |
- date9tuple = handler(dateString) |
|
3104 |
- if not date9tuple: continue |
|
3105 |
- if len(date9tuple) != 9: |
|
3106 |
- if _debug: sys.stderr.write('date handler function must return 9-tuple\n') |
|
3107 |
- raise ValueError |
|
3108 |
- map(int, date9tuple) |
|
3109 |
- return date9tuple |
|
3110 |
- except Exception as e: |
|
3111 |
- if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) |
|
3112 |
- pass |
|
3113 |
- return None |
|
3114 |
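The dispatcher's contract is: newest handler first, individual failures swallowed, and the result must look like a 9-tuple. Reduced to a runnable sketch:

    import time

    _handlers = []

    def register(fn):
        _handlers.insert(0, fn)      # newest handler gets first crack
        return fn

    def parse_date(s):
        for handler in _handlers:
            try:
                tm = handler(s)
                if tm is not None and len(tm) == 9:
                    return tm
            except Exception:
                continue             # a bad handler must not break the chain
        return None

    @register
    def iso_like(s):
        return time.strptime(s, '%Y-%m-%d')

    print(parse_date('2024-07-25')[:3])    # (2024, 7, 25)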
- |
|
3115 |
-def _getCharacterEncoding(http_headers, xml_data): |
|
3116 |
- '''Get the character encoding of the XML document |
|
3117 |
- |
|
3118 |
- http_headers is a dictionary |
|
3119 |
- xml_data is a raw string (not Unicode) |
|
3120 |
- |
|
3121 |
- This is so much trickier than it sounds, it's not even funny. |
|
3122 |
- According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type |
|
3123 |
- is application/xml, application/*+xml, |
|
3124 |
- application/xml-external-parsed-entity, or application/xml-dtd, |
|
3125 |
- the encoding given in the charset parameter of the HTTP Content-Type |
|
3126 |
- takes precedence over the encoding given in the XML prefix within the |
|
3127 |
- document, and defaults to 'utf-8' if neither are specified. But, if |
|
3128 |
- the HTTP Content-Type is text/xml, text/*+xml, or |
|
3129 |
- text/xml-external-parsed-entity, the encoding given in the XML prefix |
|
3130 |
- within the document is ALWAYS IGNORED and only the encoding given in |
|
3131 |
- the charset parameter of the HTTP Content-Type header should be |
|
3132 |
- respected, and it defaults to 'us-ascii' if not specified. |
|
3133 |
- |
|
3134 |
- Furthermore, discussion on the atom-syntax mailing list with the |
|
3135 |
- author of RFC 3023 leads me to the conclusion that any document |
|
3136 |
- served with a Content-Type of text/* and no charset parameter |
|
3137 |
- must be treated as us-ascii. (We now do this.) And also that it |
|
3138 |
- must always be flagged as non-well-formed. (We now do this too.) |
|
3139 |
- |
|
3140 |
- If Content-Type is unspecified (input was local file or non-HTTP source) |
|
3141 |
- or unrecognized (server just got it totally wrong), then go by the |
|
3142 |
- encoding given in the XML prefix of the document and default to |
|
3143 |
- 'iso-8859-1' as per the HTTP specification (RFC 2616). |
|
3144 |
- |
|
3145 |
- Then, assuming we didn't find a character encoding in the HTTP headers |
|
3146 |
- (and the HTTP Content-type allowed us to look in the body), we need |
|
3147 |
- to sniff the first few bytes of the XML data and try to determine |
|
3148 |
- whether the encoding is ASCII-compatible. Section F of the XML |
|
3149 |
- specification shows the way here: |
|
3150 |
- http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
3151 |
- |
|
3152 |
- If the sniffed encoding is not ASCII-compatible, we need to make it |
|
3153 |
- ASCII compatible so that we can sniff further into the XML declaration |
|
3154 |
- to find the encoding attribute, which will tell us the true encoding. |
|
3155 |
- |
|
3156 |
- Of course, none of this guarantees that we will be able to parse the |
|
3157 |
- feed in the declared character encoding (assuming it was declared |
|
3158 |
- correctly, which many are not). CJKCodecs and iconv_codec help a lot; |
|
3159 |
- you should definitely install them if you can. |
|
3160 |
- http://cjkpython.i18n.org/ |
|
3161 |
- ''' |
|
3162 |
- |
|
3163 |
- def _parseHTTPContentType(content_type): |
|
3164 |
- '''takes HTTP Content-Type header and returns (content type, charset) |
|
3165 |
- |
|
3166 |
- If no charset is specified, returns (content type, '') |
|
3167 |
- If no content type is specified, returns ('', '') |
|
3168 |
- Both return parameters are guaranteed to be lowercase strings |
|
3169 |
- ''' |
|
3170 |
- content_type = content_type or '' |
|
3171 |
- content_type, params = cgi.parse_header(content_type) |
|
3172 |
- return content_type, params.get('charset', '').replace("'", '') |
|
3173 |
- |
|
3174 |
- sniffed_xml_encoding = '' |
|
3175 |
- xml_encoding = '' |
|
3176 |
- true_encoding = '' |
|
3177 |
- http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) |
|
3178 |
- # Must sniff for non-ASCII-compatible character encodings before |
|
3179 |
- # searching for XML declaration. This heuristic is defined in |
|
3180 |
- # section F of the XML specification: |
|
3181 |
- # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
3182 |
- try: |
|
3183 |
- if xml_data[:4] == '\x4c\x6f\xa7\x94': |
|
3184 |
- # EBCDIC |
|
3185 |
- xml_data = _ebcdic_to_ascii(xml_data) |
|
3186 |
- elif xml_data[:4] == '\x00\x3c\x00\x3f': |
|
3187 |
- # UTF-16BE |
|
3188 |
- sniffed_xml_encoding = 'utf-16be' |
|
3189 |
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') |
|
3190 |
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): |
|
3191 |
- # UTF-16BE with BOM |
|
3192 |
- sniffed_xml_encoding = 'utf-16be' |
|
3193 |
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') |
|
3194 |
- elif xml_data[:4] == '\x3c\x00\x3f\x00': |
|
3195 |
- # UTF-16LE |
|
3196 |
- sniffed_xml_encoding = 'utf-16le' |
|
3197 |
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') |
|
3198 |
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): |
|
3199 |
- # UTF-16LE with BOM |
|
3200 |
- sniffed_xml_encoding = 'utf-16le' |
|
3201 |
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') |
|
3202 |
- elif xml_data[:4] == '\x00\x00\x00\x3c': |
|
3203 |
- # UTF-32BE |
|
3204 |
- sniffed_xml_encoding = 'utf-32be' |
|
3205 |
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') |
|
3206 |
- elif xml_data[:4] == '\x3c\x00\x00\x00': |
|
3207 |
- # UTF-32LE |
|
3208 |
- sniffed_xml_encoding = 'utf-32le' |
|
3209 |
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') |
|
3210 |
- elif xml_data[:4] == '\x00\x00\xfe\xff': |
|
3211 |
- # UTF-32BE with BOM |
|
3212 |
- sniffed_xml_encoding = 'utf-32be' |
|
3213 |
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') |
|
3214 |
- elif xml_data[:4] == '\xff\xfe\x00\x00': |
|
3215 |
- # UTF-32LE with BOM |
|
3216 |
- sniffed_xml_encoding = 'utf-32le' |
|
3217 |
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') |
|
3218 |
- elif xml_data[:3] == '\xef\xbb\xbf': |
|
3219 |
- # UTF-8 with BOM |
|
3220 |
- sniffed_xml_encoding = 'utf-8' |
|
3221 |
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') |
|
3222 |
- else: |
|
3223 |
- # ASCII-compatible |
|
3224 |
- pass |
|
3225 |
- xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) |
|
3226 |
- except: |
|
3227 |
- xml_encoding_match = None |
|
3228 |
- if xml_encoding_match: |
|
3229 |
- xml_encoding = xml_encoding_match.groups()[0].lower() |
|
3230 |
- if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): |
|
3231 |
- xml_encoding = sniffed_xml_encoding |
|
3232 |
- acceptable_content_type = 0 |
|
3233 |
- application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') |
|
3234 |
- text_content_types = ('text/xml', 'text/xml-external-parsed-entity') |
|
3235 |
- if (http_content_type in application_content_types) or \ |
|
3236 |
- (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): |
|
3237 |
- acceptable_content_type = 1 |
|
3238 |
- true_encoding = http_encoding or xml_encoding or 'utf-8' |
|
3239 |
- elif (http_content_type in text_content_types) or \ |
|
3240 |
- (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): |
|
3241 |
- acceptable_content_type = 1 |
|
3242 |
- true_encoding = http_encoding or 'us-ascii' |
|
3243 |
- elif http_content_type.startswith('text/'): |
|
3244 |
- true_encoding = http_encoding or 'us-ascii' |
|
3245 |
- elif http_headers and (not http_headers.has_key('content-type')): |
|
3246 |
- true_encoding = xml_encoding or 'iso-8859-1' |
|
3247 |
- else: |
|
3248 |
- true_encoding = xml_encoding or 'utf-8' |
|
3249 |
- # some feeds claim to be gb2312 but are actually gb18030. |
|
3250 |
- # apparently MSIE and Firefox both do the following switch: |
|
3251 |
- if true_encoding.lower() == 'gb2312': |
|
3252 |
- true_encoding = 'gb18030' |
|
3253 |
- return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type |
|
3254 |
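The RFC 3023 precedence rules spelled out in the docstring condense to a few lines (a sketch; the real function also folds in BOM sniffing and the bozo bookkeeping):

    def choose_encoding(content_type, http_charset, xml_encoding):
        # application/xml and friends: HTTP charset > XML prefix > utf-8
        if content_type.startswith('application/'):
            return http_charset or xml_encoding or 'utf-8'
        # text/xml and friends: the XML prefix is ignored entirely
        if content_type.startswith('text/'):
            return http_charset or 'us-ascii'
        # no usable Content-Type: trust the document, default per RFC 2616
        return xml_encoding or 'iso-8859-1'

    print(choose_encoding('text/xml', '', 'utf-8'))          # us-ascii
    print(choose_encoding('application/xml', '', 'big5'))    # big5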
- |
|
3255 |
-def _toUTF8(data, encoding): |
|
3256 |
- '''Changes an XML data stream on the fly to specify a new encoding |
|
3257 |
- |
|
3258 |
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already |
|
3259 |
- encoding is a string recognized by encodings.aliases |
|
3260 |
- ''' |
|
3261 |
- if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) |
|
3262 |
- # strip Byte Order Mark (if present) |
|
3263 |
- if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): |
|
3264 |
- if _debug: |
|
3265 |
- sys.stderr.write('stripping BOM\n') |
|
3266 |
- if encoding != 'utf-16be': |
|
3267 |
- sys.stderr.write('trying utf-16be instead\n') |
|
3268 |
- encoding = 'utf-16be' |
|
3269 |
- data = data[2:] |
|
3270 |
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): |
|
3271 |
- if _debug: |
|
3272 |
- sys.stderr.write('stripping BOM\n') |
|
3273 |
- if encoding != 'utf-16le': |
|
3274 |
- sys.stderr.write('trying utf-16le instead\n') |
|
3275 |
- encoding = 'utf-16le' |
|
3276 |
- data = data[2:] |
|
3277 |
- elif data[:3] == '\xef\xbb\xbf': |
|
3278 |
- if _debug: |
|
3279 |
- sys.stderr.write('stripping BOM\n') |
|
3280 |
- if encoding != 'utf-8': |
|
3281 |
- sys.stderr.write('trying utf-8 instead\n') |
|
3282 |
- encoding = 'utf-8' |
|
3283 |
- data = data[3:] |
|
3284 |
- elif data[:4] == '\x00\x00\xfe\xff': |
|
3285 |
- if _debug: |
|
3286 |
- sys.stderr.write('stripping BOM\n') |
|
3287 |
- if encoding != 'utf-32be': |
|
3288 |
- sys.stderr.write('trying utf-32be instead\n') |
|
3289 |
- encoding = 'utf-32be' |
|
3290 |
- data = data[4:] |
|
3291 |
- elif data[:4] == '\xff\xfe\x00\x00': |
|
3292 |
- if _debug: |
|
3293 |
- sys.stderr.write('stripping BOM\n') |
|
3294 |
- if encoding != 'utf-32le': |
|
3295 |
- sys.stderr.write('trying utf-32le instead\n') |
|
3296 |
- encoding = 'utf-32le' |
|
3297 |
- data = data[4:] |
|
3298 |
- newdata = unicode(data, encoding) |
|
3299 |
- if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) |
|
3300 |
- declmatch = re.compile('^<\?xml[^>]*?>') |
|
3301 |
- newdecl = '''<?xml version='1.0' encoding='utf-8'?>''' |
|
3302 |
- if declmatch.search(newdata): |
|
3303 |
- newdata = declmatch.sub(newdecl, newdata) |
|
3304 |
- else: |
|
3305 |
- newdata = newdecl + u'\n' + newdata |
|
3306 |
- return newdata.encode('utf-8') |
|
3307 |
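The BOM checks above can be expressed with the codecs constants; order matters because BOM_UTF32_LE begins with BOM_UTF16_LE, which is what the data[2:4] != '\x00\x00' guards are defending against:

    import codecs

    BOMS = [(codecs.BOM_UTF32_BE, 'utf-32-be'), (codecs.BOM_UTF32_LE, 'utf-32-le'),
            (codecs.BOM_UTF8, 'utf-8'),
            (codecs.BOM_UTF16_BE, 'utf-16-be'), (codecs.BOM_UTF16_LE, 'utf-16-le')]

    def strip_bom(data, fallback='utf-8'):
        # test UTF-32 before UTF-16, since their LE BOMs share a prefix
        for bom, enc in BOMS:
            if data.startswith(bom):
                return data[len(bom):], enc
        return data, fallback

    print(strip_bom(b'\xef\xbb\xbf<?xml?>'))    # (b'<?xml?>', 'utf-8')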
- |
|
3308 |
-def _stripDoctype(data): |
|
3309 |
- '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) |
|
3310 |
- |
|
3311 |
- rss_version may be 'rss091n' or None |
|
3312 |
- stripped_data is the same XML document, minus the DOCTYPE |
|
3313 |
- ''' |
|
3314 |
- start = re.search('<\w',data) |
|
3315 |
- start = start and start.start() or -1 |
|
3316 |
- head,data = data[:start+1], data[start+1:] |
|
3317 |
- |
|
3318 |
- entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) |
|
3319 |
- entity_results=entity_pattern.findall(head) |
|
3320 |
- head = entity_pattern.sub('', head) |
|
3321 |
- doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) |
|
3322 |
- doctype_results = doctype_pattern.findall(head) |
|
3323 |
- doctype = doctype_results and doctype_results[0] or '' |
|
3324 |
- if doctype.lower().count('netscape'): |
|
3325 |
- version = 'rss091n' |
|
3326 |
- else: |
|
3327 |
- version = None |
|
3328 |
- |
|
3329 |
- # only allow in 'safe' inline entity definitions |
|
3330 |
- replacement='' |
|
3331 |
- if len(doctype_results)==1 and entity_results: |
|
3332 |
- safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"') |
|
3333 |
- safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) |
|
3334 |
- if safe_entities: |
|
3335 |
- replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities) |
|
3336 |
- data = doctype_pattern.sub(replacement, head) + data |
|
3337 |
- |
|
3338 |
- return version, data, dict(replacement and safe_pattern.findall(replacement)) |
|
3339 |
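The "safe inline entity" gauntlet keeps simple name/value ENTITY declarations and drops anything external (SYSTEM/PUBLIC). Its core regex, exercised standalone:

    import re

    safe = re.compile(r'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')

    for decl in (' nbsp "&#160;"', ' evil SYSTEM "http://x/evil.dtd"'):
        print(repr(decl), '->', bool(safe.match(decl)))
    # the nbsp declaration matches; the SYSTEM declaration does not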
- |
|
3340 |
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): |
|
3341 |
- '''Parse a feed from a URL, file, stream, or string''' |
|
3342 |
- result = FeedParserDict() |
|
3343 |
- result['feed'] = FeedParserDict() |
|
3344 |
- result['entries'] = [] |
|
3345 |
- if _XML_AVAILABLE: |
|
3346 |
- result['bozo'] = 0 |
|
3347 |
- if type(handlers) == types.InstanceType: |
|
3348 |
- handlers = [handlers] |
|
3349 |
- try: |
|
3350 |
- f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) |
|
3351 |
- data = f.read() |
|
3352 |
- except Exception as e: |
|
3353 |
- result['bozo'] = 1 |
|
3354 |
- result['bozo_exception'] = e |
|
3355 |
- data = '' |
|
3356 |
- f = None |
|
3357 |
- |
|
3358 |
- # if feed is gzip-compressed, decompress it |
|
3359 |
- if f and data and hasattr(f, 'headers'): |
|
3360 |
- if gzip and f.headers.get('content-encoding', '') == 'gzip': |
|
3361 |
- try: |
|
3362 |
- data = gzip.GzipFile(fileobj=_StringIO(data)).read() |
|
3363 |
- except Exception as e: |
|
3364 |
- # Some feeds claim to be gzipped but they're not, so |
|
3365 |
- # we get garbage. Ideally, we should re-request the |
|
3366 |
- # feed without the 'Accept-encoding: gzip' header, |
|
3367 |
- # but we don't. |
|
3368 |
- result['bozo'] = 1 |
|
3369 |
- result['bozo_exception'] = e |
|
3370 |
- data = '' |
|
3371 |
- elif zlib and f.headers.get('content-encoding', '') == 'deflate': |
|
3372 |
- try: |
|
3373 |
- data = zlib.decompress(data, -zlib.MAX_WBITS) |
|
3374 |
- except Exception as e: |
|
3375 |
- result['bozo'] = 1 |
|
3376 |
- result['bozo_exception'] = e |
|
3377 |
- data = '' |
|
3378 |
- |
|
3379 |
- # save HTTP headers |
|
3380 |
- if hasattr(f, 'info'): |
|
3381 |
- info = f.info() |
|
3382 |
- etag = info.getheader('ETag') |
|
3383 |
- if etag: |
|
3384 |
- result['etag'] = etag |
|
3385 |
- last_modified = info.getheader('Last-Modified') |
|
3386 |
- if last_modified: |
|
3387 |
- result['modified'] = _parse_date(last_modified) |
|
3388 |
- if hasattr(f, 'url'): |
|
3389 |
- result['href'] = f.url |
|
3390 |
- result['status'] = 200 |
|
3391 |
- if hasattr(f, 'status'): |
|
3392 |
- result['status'] = f.status |
|
3393 |
- if hasattr(f, 'headers'): |
|
3394 |
- result['headers'] = f.headers.dict |
|
3395 |
- if hasattr(f, 'close'): |
|
3396 |
- f.close() |
|
3397 |
- |
|
3398 |
- # there are four encodings to keep track of: |
|
3399 |
- # - http_encoding is the encoding declared in the Content-Type HTTP header |
|
3400 |
- # - xml_encoding is the encoding declared in the <?xml declaration |
|
3401 |
- # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data |
|
3402 |
- # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications |
|
3403 |
- http_headers = result.get('headers', {}) |
|
3404 |
- result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ |
|
3405 |
- _getCharacterEncoding(http_headers, data) |
|
3406 |
- if http_headers and (not acceptable_content_type): |
|
3407 |
- if http_headers.has_key('content-type'): |
|
3408 |
- bozo_message = '%s is not an XML media type' % http_headers['content-type'] |
|
3409 |
- else: |
|
3410 |
- bozo_message = 'no Content-type specified' |
|
3411 |
- result['bozo'] = 1 |
|
3412 |
- result['bozo_exception'] = NonXMLContentType(bozo_message) |
|
3413 |
- |
|
3414 |
- result['version'], data, entities = _stripDoctype(data) |
|
3415 |
- |
|
3416 |
- baseuri = http_headers.get('content-location', result.get('href')) |
|
3417 |
- baselang = http_headers.get('content-language', None) |
|
3418 |
- |
|
3419 |
- # if server sent 304, we're done |
|
3420 |
- if result.get('status', 0) == 304: |
|
3421 |
- result['version'] = '' |
|
3422 |
- result['debug_message'] = 'The feed has not changed since you last checked, ' + \ |
|
3423 |
- 'so the server sent no data. This is a feature, not a bug!' |
|
3424 |
- return result |
|
3425 |
- |
|
3426 |
- # if there was a problem downloading, we're done |
|
3427 |
- if not data: |
|
3428 |
- return result |
|
3429 |
- |
|
3430 |
- # determine character encoding |
|
3431 |
- use_strict_parser = 0 |
|
3432 |
- known_encoding = 0 |
|
3433 |
- tried_encodings = [] |
|
3434 |
- # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM |
|
3435 |
- for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): |
|
3436 |
- if not proposed_encoding: continue |
|
3437 |
- if proposed_encoding in tried_encodings: continue |
|
3438 |
- tried_encodings.append(proposed_encoding) |
|
3439 |
- try: |
|
3440 |
- data = _toUTF8(data, proposed_encoding) |
|
3441 |
- known_encoding = use_strict_parser = 1 |
|
3442 |
- break |
|
3443 |
- except: |
|
3444 |
- pass |
|
3445 |
- # if no luck and we have auto-detection library, try that |
|
3446 |
- if (not known_encoding) and chardet: |
|
3447 |
- try: |
|
3448 |
- proposed_encoding = chardet.detect(data)['encoding'] |
|
3449 |
- if proposed_encoding and (proposed_encoding not in tried_encodings): |
|
3450 |
- tried_encodings.append(proposed_encoding) |
|
3451 |
- data = _toUTF8(data, proposed_encoding) |
|
3452 |
- known_encoding = use_strict_parser = 1 |
|
3453 |
- except: |
|
3454 |
- pass |
|
3455 |
- # if still no luck and we haven't tried utf-8 yet, try that |
|
3456 |
- if (not known_encoding) and ('utf-8' not in tried_encodings): |
|
3457 |
- try: |
|
3458 |
- proposed_encoding = 'utf-8' |
|
3459 |
- tried_encodings.append(proposed_encoding) |
|
3460 |
- data = _toUTF8(data, proposed_encoding) |
|
3461 |
- known_encoding = use_strict_parser = 1 |
|
3462 |
- except: |
|
3463 |
- pass |
|
3464 |
- # if still no luck and we haven't tried windows-1252 yet, try that |
|
3465 |
- if (not known_encoding) and ('windows-1252' not in tried_encodings): |
|
3466 |
- try: |
|
3467 |
- proposed_encoding = 'windows-1252' |
|
3468 |
- tried_encodings.append(proposed_encoding) |
|
3469 |
- data = _toUTF8(data, proposed_encoding) |
|
3470 |
- known_encoding = use_strict_parser = 1 |
|
3471 |
- except: |
|
3472 |
- pass |
|
3473 |
- # if still no luck and we haven't tried iso-8859-2 yet, try that. |
|
3474 |
- if (not known_encoding) and ('iso-8859-2' not in tried_encodings): |
|
3475 |
- try: |
|
3476 |
- proposed_encoding = 'iso-8859-2' |
|
3477 |
- tried_encodings.append(proposed_encoding) |
|
3478 |
- data = _toUTF8(data, proposed_encoding) |
|
3479 |
- known_encoding = use_strict_parser = 1 |
|
3480 |
- except: |
|
3481 |
- pass |
|
3482 |
- # if still no luck, give up |
|
3483 |
- if not known_encoding: |
|
3484 |
- result['bozo'] = 1 |
|
3485 |
- result['bozo_exception'] = CharacterEncodingUnknown( \ |
|
3486 |
- 'document encoding unknown, I tried ' + \ |
|
3487 |
- '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \ |
|
3488 |
- (result['encoding'], xml_encoding)) |
|
3489 |
- result['encoding'] = '' |
|
3490 |
- elif proposed_encoding != result['encoding']: |
|
3491 |
- result['bozo'] = 1 |
|
3492 |
- result['bozo_exception'] = CharacterEncodingOverride( \ |
|
3493 |
- 'document declared as %s, but parsed as %s' % \
|
3494 |
- (result['encoding'], proposed_encoding)) |
|
3495 |
- result['encoding'] = proposed_encoding |
|
3496 |
- |
|
3497 |
- if not _XML_AVAILABLE: |
|
3498 |
- use_strict_parser = 0 |
|
3499 |
- if use_strict_parser: |
|
3500 |
- # initialize the SAX parser |
|
3501 |
- feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') |
|
3502 |
- saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) |
|
3503 |
- saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) |
|
3504 |
- saxparser.setContentHandler(feedparser) |
|
3505 |
- saxparser.setErrorHandler(feedparser) |
|
3506 |
- source = xml.sax.xmlreader.InputSource() |
|
3507 |
- source.setByteStream(_StringIO(data)) |
|
3508 |
- if hasattr(saxparser, '_ns_stack'): |
|
3509 |
- # work around bug in built-in SAX parser (doesn't recognize xml: namespace) |
|
3510 |
- # PyXML doesn't have this problem, and it doesn't have _ns_stack either |
|
3511 |
- saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'}) |
|
3512 |
- try: |
|
3513 |
- saxparser.parse(source) |
|
3514 |
- except Exception as e:
|
3515 |
- if _debug: |
|
3516 |
- import traceback |
|
3517 |
- traceback.print_stack() |
|
3518 |
- traceback.print_exc() |
|
3519 |
- sys.stderr.write('xml parsing failed\n') |
|
3520 |
- result['bozo'] = 1 |
|
3521 |
- result['bozo_exception'] = feedparser.exc or e |
|
3522 |
- use_strict_parser = 0 |
|
3523 |
- if not use_strict_parser: |
|
3524 |
- feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities) |
|
3525 |
- feedparser.feed(data) |
|
3526 |
- result['feed'] = feedparser.feeddata |
|
3527 |
- result['entries'] = feedparser.entries |
|
3528 |
- result['version'] = result['version'] or feedparser.version |
|
3529 |
- result['namespaces'] = feedparser.namespacesInUse |
|
3530 |
- return result |
|
3531 |
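parse()'s encoding strategy is: try candidates in priority order, stop at the first that decodes, and remember whether the declared encoding had to be overridden (that is what raises CharacterEncodingOverride above). In miniature:

    def decode_best_effort(data, declared):
        tried = []
        for enc in (declared, 'utf-8', 'windows-1252', 'iso-8859-2'):
            if not enc or enc in tried:
                continue
            tried.append(enc)
            try:
                return data.decode(enc), enc, enc != declared
            except (UnicodeDecodeError, LookupError):
                continue
        return None, None, True      # CharacterEncodingUnknown territory

    text, used, overridden = decode_best_effort(b'caf\xe9', 'utf-8')
    print(used, overridden)          # windows-1252 True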
- |
|
3532 |
-class Serializer: |
|
3533 |
- def __init__(self, results): |
|
3534 |
- self.results = results |
|
3535 |
- |
|
3536 |
-class TextSerializer(Serializer): |
|
3537 |
- def write(self, stream=sys.stdout): |
|
3538 |
- self._writer(stream, self.results, '') |
|
3539 |
- |
|
3540 |
- def _writer(self, stream, node, prefix): |
|
3541 |
- if not node: return |
|
3542 |
- if hasattr(node, 'keys'): |
|
3543 |
- keys = node.keys() |
|
3544 |
- keys.sort() |
|
3545 |
- for k in keys: |
|
3546 |
- if k in ('description', 'link'): continue |
|
3547 |
- if node.has_key(k + '_detail'): continue |
|
3548 |
- if node.has_key(k + '_parsed'): continue |
|
3549 |
- self._writer(stream, node[k], prefix + k + '.') |
|
3550 |
- elif type(node) == types.ListType: |
|
3551 |
- index = 0 |
|
3552 |
- for n in node: |
|
3553 |
- self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].') |
|
3554 |
- index += 1 |
|
3555 |
- else: |
|
3556 |
- try: |
|
3557 |
- s = str(node).encode('utf-8') |
|
3558 |
- s = s.replace('\\', '\\\\') |
|
3559 |
- s = s.replace('\r', '') |
|
3560 |
- s = s.replace('\n', r'\n') |
|
3561 |
- stream.write(prefix[:-1]) |
|
3562 |
- stream.write('=') |
|
3563 |
- stream.write(s) |
|
3564 |
- stream.write('\n') |
|
3565 |
- except: |
|
3566 |
- pass |
|
3567 |
- |
|
3568 |
-class PprintSerializer(Serializer): |
|
3569 |
- def write(self, stream=sys.stdout): |
|
3570 |
- if self.results.has_key('href'): |
|
3571 |
- stream.write(self.results['href'] + '\n\n') |
|
3572 |
- from pprint import pprint |
|
3573 |
- pprint(self.results, stream) |
|
3574 |
- stream.write('\n') |
|
3575 |
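TextSerializer's traversal flattens nested results into key=value lines: dotted keys for dicts, [i] for lists. A compact Python 3 rendering of the same walk:

    def flatten(node, prefix=''):
        if isinstance(node, dict):
            for k in sorted(node):
                yield from flatten(node[k], prefix + k + '.')
        elif isinstance(node, list):
            for i, n in enumerate(node):
                yield from flatten(n, '%s[%d].' % (prefix[:-1], i))
        else:
            yield '%s=%s' % (prefix[:-1], node)

    print('\n'.join(flatten({'feed': {'title': 'x'}, 'entries': [{'id': 1}]})))
    # entries[0].id=1
    # feed.title=x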
- |
|
3576 |
-if __name__ == '__main__': |
|
3577 |
- try: |
|
3578 |
- from optparse import OptionParser |
|
3579 |
- except: |
|
3580 |
- OptionParser = None |
|
3581 |
- |
|
3582 |
- if OptionParser: |
|
3583 |
- optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-") |
|
3584 |
- optionParser.set_defaults(format="pprint") |
|
3585 |
- optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs") |
|
3586 |
- optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs") |
|
3587 |
- optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs") |
|
3588 |
- optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") |
|
3589 |
- optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)") |
|
3590 |
- optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr") |
|
3591 |
- (options, urls) = optionParser.parse_args() |
|
3592 |
- if options.verbose: |
|
3593 |
- _debug = 1 |
|
3594 |
- if not urls: |
|
3595 |
- optionParser.print_help() |
|
3596 |
- sys.exit(0) |
|
3597 |
- else: |
|
3598 |
- if not sys.argv[1:]: |
|
3599 |
- print __doc__ |
|
3600 |
- sys.exit(0) |
|
3601 |
- class _Options: |
|
3602 |
- etag = modified = agent = referrer = None |
|
3603 |
- format = 'pprint' |
|
3604 |
- options = _Options() |
|
3605 |
- urls = sys.argv[1:] |
|
3606 |
- |
|
3607 |
- zopeCompatibilityHack() |
|
3608 |
- |
|
3609 |
- serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer) |
|
3610 |
- for url in urls: |
|
3611 |
- results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer) |
|
3612 |
- serializer(results).write(sys.stdout) |
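Note: the hunks above finish deleting the vendored Python 2 feedparser.py (its SAX wiring, the Serializer classes, and the optparse CLI entry point). The script presumably switches to the pip-installable feedparser package; that import isn't visible in these hunks, so the following usage sketch is an assumption:

    import feedparser  # assumption: PyPI package replacing the deleted vendored copy

    d = feedparser.parse('https://techcrunch.com/feed/')
    if d.get('bozo'):  # parse problems are flagged on the result, not raised
        print('bozo_exception:', d.get('bozo_exception'))
    print(d.get('status'), len(d.entries))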
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-#!/usr/bin/env python |
|
1 |
+#!/home/dblume/opt/python-3.9.6/bin/python3 |
|
2 | 2 |
# |
3 | 3 |
# Testing without affecting the yaml file and saving the updated one aside: |
4 | 4 |
# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; \ |
... | ... |
@@ -14,18 +14,16 @@ import codecs |
14 | 14 |
import traceback |
15 | 15 |
import calendar |
16 | 16 |
import pickle |
17 |
-import exceptions |
|
18 |
-import urllib |
|
19 |
-import urllib2 |
|
20 |
-import httplib |
|
17 |
+import urllib.request, urllib.parse, urllib.error |
|
18 |
+import http.client |
|
21 | 19 |
import shutil |
22 | 20 |
import smtplib |
23 | 21 |
import analysis |
24 | 22 |
import json |
25 | 23 |
import xml |
26 | 24 |
import operator |
27 |
-import cgi |
|
28 |
-import cStringIO |
|
25 |
+import html |
|
26 |
+import io |
|
29 | 27 |
import smtp_creds # Your own credentials, used in send_email() |
30 | 28 |
|
31 | 29 |
debug = True |
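Note: a quick reference for the call-site renames these import swaps imply while reading the hunks below (a sketch; this file keeps module-qualified names rather than from-imports):

    from urllib.parse import urlencode, quote_plus  # was urllib.urlencode / urllib.quote_plus
    from urllib.request import urlopen              # was urllib2.urlopen
    from urllib.error import URLError               # was urllib2.URLError
    from http.client import BadStatusLine           # was httplib.BadStatusLine
    from html import escape                         # was cgi.escape; now escapes quotes by default
    from io import StringIO                         # was cStringIO.StringIO; holds str, not bytes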
... | ... |
@@ -144,7 +142,7 @@ def send_email(subject, message, toaddrs, |
144 | 142 |
|
145 | 143 |
def index_id(a_list, op, elem): |
146 | 144 |
try: |
147 |
- return (index for index, item in enumerate(a_list) if op(item, elem)).next() |
|
145 |
+ return next((index for index, item in enumerate(a_list) if op(item, elem))) |
|
148 | 146 |
except: |
149 | 147 |
return -1 |
150 | 148 |
|
... | ... |
@@ -219,31 +217,31 @@ def process_feed(yaml_items): |
219 | 217 |
else: |
220 | 218 |
if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302: |
221 | 219 |
if feed.status == 503: |
222 |
- print "the feed is temporarily unavailable." |
|
220 |
+ print("the feed is temporarily unavailable.") |
|
223 | 221 |
elif feed.status == 400: |
224 |
- print "the feed says we made a bad request." |
|
222 |
+ print("the feed says we made a bad request.") |
|
225 | 223 |
elif feed.status == 502: |
226 |
- print "the feed reported a bad gateway error." |
|
224 |
+ print("the feed reported a bad gateway error.") |
|
227 | 225 |
elif feed.status == 404: |
228 |
- print "the feed says the page was not found." |
|
226 |
+ print("the feed says the page was not found.") |
|
229 | 227 |
elif feed.status == 500: |
230 |
- print "the feed had an internal server error." |
|
228 |
+ print("the feed had an internal server error.") |
|
231 | 229 |
elif feed.status == 403: |
232 |
- print "Access to the feed was forbidden." |
|
230 |
+ print("Access to the feed was forbidden.") |
|
233 | 231 |
else: |
234 |
- print "the feed returned feed.status %d." % ( feed.status, ) |
|
232 |
+ print("the feed returned feed.status %d." % ( feed.status, )) |
|
235 | 233 |
else: |
236 | 234 |
# Save off this |
237 | 235 |
if hasattr(feed, 'bozo_exception') and isinstance(feed.bozo_exception, xml.sax._exceptions.SAXParseException): |
238 |
- print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % (str(feed.bozo_exception)) |
|
236 |
+ print("Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % (str(feed.bozo_exception))) |
|
239 | 237 |
else: |
240 | 238 |
try: |
241 | 239 |
with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f: |
242 | 240 |
pickle.dump(feed, f) |
243 |
- except(pickle.PicklingError, exceptions.TypeError) as e: |
|
241 |
+ except (pickle.PicklingError, TypeError) as e: |
244 |
- print "An error occurred while pickling the feed: %s." % \ |
|
242 |
+ print("An error occurred while pickling the feed: %s." % \ |
|
245 | 243 |
(# str(e.__class__), |
246 |
- str(e)) |
|
244 |
+ str(e))) |
|
247 | 245 |
traceback.print_exc(3, file=sys.stdout) |
248 | 246 |
|
249 | 247 |
for i in reversed(feed.entries): |
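Note: the status if/elif chain above could equally be a lookup table; a sketch carrying the same messages:

    STATUS_MESSAGES = {
        503: "the feed is temporarily unavailable.",
        400: "the feed says we made a bad request.",
        502: "the feed reported a bad gateway error.",
        404: "the feed says the page was not found.",
        500: "the feed had an internal server error.",
        403: "Access to the feed was forbidden.",
    }
    print(STATUS_MESSAGES.get(feed.status, "the feed returned feed.status %d." % feed.status))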
... | ... |
@@ -261,32 +259,32 @@ def process_feed(yaml_items): |
261 | 259 |
else: |
262 | 260 |
if hasattr(feed, 'bozo_exception'): |
263 | 261 |
e = feed.bozo_exception |
264 |
- if isinstance(e, urllib2.URLError): |
|
262 |
+ if isinstance(e, urllib.error.URLError): |
|
265 | 263 |
print_last_line = True |
266 | 264 |
if hasattr(e, 'reason'): |
267 | 265 |
if e.reason[0] == 110: |
268 |
- print "the feed's connection timed out." |
|
266 |
+ print("the feed's connection timed out.") |
|
269 | 267 |
print_last_line = False |
270 | 268 |
elif e.reason[0] == 111: |
271 |
- print "the feed's connection was refused." |
|
269 |
+ print("the feed's connection was refused.") |
|
272 | 270 |
print_last_line = False |
273 | 271 |
elif e.reason[0] == 104: |
274 |
- print "the feed reset the connection." |
|
272 |
+ print("the feed reset the connection.") |
|
275 | 273 |
print_last_line = False |
276 | 274 |
else: |
277 |
- print "the feed had a URLError with reason %s." % (str(e.reason),) |
|
275 |
+ print("the feed had a URLError with reason %s." % (str(e.reason),)) |
|
278 | 276 |
print_last_line = False |
279 | 277 |
if print_last_line: |
280 |
- print "the feed had a URLError %s" % (str(e),) |
|
281 |
- elif isinstance(e, httplib.BadStatusLine): |
|
282 |
- print "the feed gave a bad status line. (%s)" % (str(e),) |
|
278 |
+ print("the feed had a URLError %s" % (str(e),)) |
|
279 |
+ elif isinstance(e, http.client.BadStatusLine): |
|
280 |
+ print("the feed gave a bad status line. (%s)" % (str(e),)) |
|
283 | 281 |
else: |
284 | 282 |
if len(str(e)): |
285 |
- print "the feed bozo_exception: %s \"%s\"" % (str(e.__class__), str(e)) |
|
283 |
+ print("the feed bozo_exception: %s \"%s\"" % (str(e.__class__), str(e))) |
|
286 | 284 |
else: |
287 |
- print "the feed bozo_exception: %s %s" % (str(e.__class__), repr(e)) |
|
285 |
+ print("the feed bozo_exception: %s %s" % (str(e.__class__), repr(e))) |
|
288 | 286 |
else: |
289 |
- print "the feed returned class %s, %s" % (str(feed.__class__), str(feed)) |
|
287 |
+ print("the feed returned class %s, %s" % (str(feed.__class__), str(feed))) |
|
290 | 288 |
|
291 | 289 |
|
292 | 290 |
def process_item(feed_item, yaml_items): |
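Note: one porting caveat the hunk above leaves in place: under Python 3, URLError.reason is usually an OSError instance, so e.reason[0] no longer yields an errno (indexing an exception raises TypeError). A sketch of the equivalent check, assuming the magic numbers 110/111/104 are Linux errno values:

    import errno

    reason_errno = getattr(e.reason, 'errno', None)
    if reason_errno == errno.ETIMEDOUT:       # 110
        print("the feed's connection timed out.")
    elif reason_errno == errno.ECONNREFUSED:  # 111
        print("the feed's connection was refused.")
    elif reason_errno == errno.ECONNRESET:    # 104
        print("the feed reset the connection.")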
... | ... |
@@ -300,7 +298,7 @@ def process_item(feed_item, yaml_items): |
300 | 298 |
elif hasattr(feed_item, 'date_parsed'): |
301 | 299 |
date_parsed = feed_item.date_parsed |
302 | 300 |
else: |
303 |
- print "process_item found no timestamp for", asciiize(feed_item.link) |
|
301 |
+ print("process_item found no timestamp for", asciiize(feed_item.link)) |
|
304 | 302 |
timecode_parsed = calendar.timegm(date_parsed) |
305 | 303 |
|
306 | 304 |
link = feed_item.link |
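Note: in the hunk above, the else branch prints a warning but never binds date_parsed, so the following calendar.timegm(date_parsed) raises NameError. A defensive fallback (a sketch, not part of the commit):

    else:
        print("process_item found no timestamp for", asciiize(feed_item.link))
        date_parsed = time.gmtime()  # assume "now" rather than crashing
    timecode_parsed = calendar.timegm(date_parsed)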
... | ... |
@@ -398,21 +396,21 @@ def Get_fb_stats(url_string): |
398 | 396 |
url_string = url_string.encode('utf-8') |
399 | 397 |
|
400 | 398 |
try: |
401 |
- encoded = urllib.urlencode({'access_token': facebook_token}) |
|
399 |
+ encoded = urllib.parse.urlencode({'access_token': facebook_token}) |
|
402 | 400 |
url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s' |
403 |
- f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded)) |
|
401 |
+ f = urllib.request.urlopen(url % (urllib.parse.quote_plus(url_string), encoded)) |
|
404 | 402 |
data = f.read() |
405 | 403 |
f.close() |
406 |
- except (urllib2.URLError, httplib.BadStatusLine) as e: |
|
404 |
+ except (urllib.error.URLError, http.client.BadStatusLine) as e: |
|
407 | 405 |
if hasattr(e, 'reason'): # URLError |
408 | 406 |
if hasattr(e, 'code'): |
409 |
- print "Get_fb_stats got an error (1):", e.code, e.reason, url_string |
|
407 |
+ print("Get_fb_stats got an error (1):", e.code, e.reason, url_string) |
|
410 | 408 |
else: |
411 |
- print "Get_fb_stats got an error (2):", e.reason, url_string |
|
409 |
+ print("Get_fb_stats got an error (2):", e.reason, url_string) |
|
412 | 410 |
elif hasattr(e, 'code'): #URLError |
413 |
- print "Get_fb_stats got an error. Code:", e.code, url_string |
|
411 |
+ print("Get_fb_stats got an error. Code:", e.code, url_string) |
|
414 | 412 |
else: |
415 |
- print "Get_fb_stats got an error (3):", str(e) |
|
413 |
+ print("Get_fb_stats got an error (3):", str(e)) |
|
416 | 414 |
return shares, comments, likes |
417 | 415 |
if len(data) > 20: |
418 | 416 |
d = json.loads(data)['engagement'] |
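Note: the whole Graph API query could be built with urlencode instead of hand-quoting pieces; a sketch using the same names as the hunk above (urlencode quotes bytes values such as the encoded url_string directly):

    params = urllib.parse.urlencode({'id': url_string,
                                     'fields': 'engagement',
                                     'access_token': facebook_token})
    with urllib.request.urlopen('https://graph.facebook.com/v2.11/?' + params) as f:
        data = f.read()  # bytes; json.loads() accepts bytes on Python 3.6+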
... | ... |
@@ -435,7 +433,7 @@ def Get_fb_stats(url_string): |
435 | 433 |
except KeyError: |
436 | 434 |
comments = 0 |
437 | 435 |
else: |
438 |
- print "Get_fb_stats got too little data for ", url_string |
|
436 |
+ print("Get_fb_stats got too little data for ", url_string) |
|
439 | 437 |
return shares, comments, likes |
440 | 438 |
|
441 | 439 |
|
... | ... |
@@ -445,7 +443,7 @@ def make_index_html(yaml_items, weekend_stats, weekday_stats): |
445 | 443 |
new_index_fullpath = os.path.join(localdir, 'index.html_new') |
446 | 444 |
index_fullpath = os.path.join(localdir, 'index.html') |
447 | 445 |
|
448 |
- chart_io = cStringIO.StringIO() |
|
446 |
+ chart_io = io.StringIO() |
|
449 | 447 |
for image_index, image in enumerate(yaml_items[:40]): |
450 | 448 |
tag_hit = False |
451 | 449 |
if image['author'].lower() in authors_to_post: |
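Note: io.StringIO buffers text where cStringIO held bytes, so the swap above is right as long as everything written to chart_io is str; a minimal sketch of the pattern:

    chart_io = io.StringIO()
    chart_io.write('<img src="chart0.png">\n')  # hypothetical markup; str in, str out
    markup = chart_io.getvalue()                # io.BytesIO would be needed for bytes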
... | ... |
@@ -498,8 +496,8 @@ def make_feed_file(yaml_items): |
498 | 496 |
for item in yaml_items: |
499 | 497 |
now = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(item['orig_posted'])) |
500 | 498 |
if item['qualified'] != -1: |
501 |
- escaped_title = cgi.escape(item['title']).encode('ascii', 'xmlcharrefreplace') |
|
502 |
- escaped_author = cgi.escape(item['author']).encode('ascii', 'xmlcharrefreplace') |
|
499 |
+ escaped_title = html.escape(item['title']).encode('ascii', 'xmlcharrefreplace') |
|
500 |
+ escaped_author = html.escape(item['author']).encode('ascii', 'xmlcharrefreplace') |
|
503 | 501 |
f.write("<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \ |
504 | 502 |
(escaped_title, now, item['link'], item['link'], escaped_author)) |
505 | 503 |
count += 1 |
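Note: .encode('ascii', 'xmlcharrefreplace') returns bytes, and interpolating bytes into a str format under Python 3 renders as b'...' in the output. Decoding back to str keeps the character references (a sketch):

    escaped_title = html.escape(item['title']).encode('ascii', 'xmlcharrefreplace').decode('ascii')
    escaped_author = html.escape(item['author']).encode('ascii', 'xmlcharrefreplace').decode('ascii')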
... | ... |
@@ -514,7 +512,7 @@ if __name__=='__main__': |
514 | 512 |
|
515 | 513 |
old_stdout = sys.stdout |
516 | 514 |
old_stderr = sys.stderr |
517 |
- sys.stdout = sys.stderr = cStringIO.StringIO() |
|
515 |
+ sys.stdout = sys.stderr = io.StringIO() |
|
518 | 516 |
|
519 | 517 |
try: |
520 | 518 |
localdir = os.path.abspath(os.path.dirname(sys.argv[0])) |
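Note: the manual stdout/stderr swap above predates contextlib; a modern equivalent (a sketch; run() is a hypothetical stand-in for the body of the try block):

    import contextlib
    import io

    buf = io.StringIO()
    with contextlib.redirect_stdout(buf), contextlib.redirect_stderr(buf):
        run()
    message = buf.getvalue()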
... | ... |
@@ -542,7 +540,7 @@ if __name__=='__main__': |
542 | 540 |
with open(yaml_fullpath, 'rb') as f: |
543 | 541 |
items = yaml.load(f, Loader=yaml.Loader) |
544 | 542 |
if items is None: |
545 |
- print yaml_fullpath, "exists, but was empty." |
|
543 |
+ print(yaml_fullpath, "exists, but was empty.") |
|
546 | 544 |
items = [] |
547 | 545 |
|
548 | 546 |
# Do any dictionary item updating that might be necessary |
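Note: yaml.load with Loader=yaml.Loader will construct arbitrary Python-tagged objects; if techcrunch.yaml only ever holds plain lists and dicts (an assumption), safe_load is the tighter choice and folds in the empty-file case:

    with open(yaml_fullpath, 'rb') as f:
        items = yaml.safe_load(f) or []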
... | ... |
@@ -550,7 +548,7 @@ if __name__=='__main__': |
550 | 548 |
# if not item.has_key('fb_shares'): |
551 | 549 |
# item['fb_shares'] = [] |
552 | 550 |
else: |
553 |
- print "could not open", yaml_fullpath |
|
551 |
+ print("could not open", yaml_fullpath) |
|
554 | 552 |
items = [] |
555 | 553 |
|
556 | 554 |
with open(os.path.join(localdir, 'facebook-token.txt'), 'r') as f: |
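Note: the commented-out has_key() check above is a leftover Python 2 idiom; its Python 3 spelling, should it be revived (sketch):

    for item in items:
        if 'fb_shares' not in item:
            item['fb_shares'] = []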
... | ... |
@@ -607,7 +605,7 @@ if __name__=='__main__': |
607 | 605 |
try: |
608 | 606 |
os.rename(yaml_newfile_fullpath, yaml_fullpath) |
609 | 607 |
except OSError as e: |
610 |
- print "The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath) |
|
608 |
+ print("The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath)) |
|
611 | 609 |
with open(os.path.join(localdir, 'techcrunch_text.yaml'), 'w') as f: |
612 | 610 |
yaml.dump(items, f, default_flow_style=None, width=120) |
613 | 611 |
with codecs.open(os.path.join(localdir, 'techcrunch_unicode.yaml'), 'w', 'utf-8') as f: |
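Note: os.rename raises OSError on Windows when the destination exists; os.replace (Python 3.3+) overwrites it in one step and would remove the need for the diagnostic fallback above (sketch):

    os.replace(yaml_newfile_fullpath, yaml_fullpath)  # atomic overwrite where the OS allows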
... | ... |
@@ -617,24 +615,24 @@ if __name__=='__main__': |
617 | 615 |
|
618 | 616 |
make_index_html(items, weekend_stats, weekday_stats) |
619 | 617 |
else: |
620 |
- print "No entries were added this time." |
|
618 |
+ print("No entries were added this time.") |
|
621 | 619 |
|
622 | 620 |
except Exception as e: |
623 | 621 |
exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e) |
624 |
- print exceptional_text, ' '.join(progress_text) |
|
622 |
+ print(exceptional_text, ' '.join(progress_text)) |
|
625 | 623 |
traceback.print_exc(file=sys.stdout) |
626 | 624 |
try: |
627 | 625 |
send_email('Exception thrown in ' + os.path.basename(__file__), |
628 | 626 |
exceptional_text + "\n" + traceback.format_exc(), |
629 | 627 |
(smtp_creds.default_recipient,)) |
630 | 628 |
except Exception as e: |
631 |
- print "Could not send email to notify you of the exception. :(" |
|
629 |
+ print("Could not send email to notify you of the exception. :(") |
|
632 | 630 |
|
633 | 631 |
message = sys.stdout.getvalue() |
634 | 632 |
sys.stdout = old_stdout |
635 | 633 |
sys.stderr = old_stderr |
636 | 634 |
if not debug: |
637 |
- print message |
|
635 |
+ print(message) |
|
638 | 636 |
|
639 | 637 |
# Finally, let's save this to a statistics page |
640 | 638 |
if os.path.exists(os.path.join(localdir, 'stats.txt')): |
641 | 639 |