dblume committed on 2024-07-25 21:43:27
Showing 3 changed files, with 59 additions and 3674 deletions.
@@ -1,11 +1,10 @@
-#!/usr/bin/env python
+#!/home/dblume/opt/python-3.9.6/bin/python3
 
 import yaml
 import sys
 import os
 import time
 import traceback
-import exceptions
 import math
 import bisect
 
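Note: the new shebang pins the script to a specific local CPython 3.9 build rather than resolving the interpreter through env; "#!/usr/bin/env python3" would be the portable alternative. Dropping "import exceptions" is mandatory rather than cosmetic: that module was removed in Python 3, where every built-in exception is available without any import. A minimal illustration (not from this repo):

    # Python 3: built-in exceptions need no import
    try:
        1 / 0
    except ZeroDivisionError:  # Python 2 also exposed this via the exceptions module
        pass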
@@ -15,7 +14,7 @@ debug = True
 def get_standard_deviation(l):
     """ returns the standard deviation of the iterable l """
     mean = sum(l) / len(l)
-    squares_of_diffs = map(lambda x: pow(x - mean, 2), l)
+    squares_of_diffs = [pow(x - mean, 2) for x in l]
     mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
     return math.sqrt(mean_of_squares)
 
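Note: the map-to-list-comprehension change above (repeated in two more hunks below) is a real Python 3 fix, not a style tweak. In Python 3, map() returns a lazy iterator that has no len(), so the next line's sum(...) / len(...) would raise TypeError. The list comprehension materializes the squared differences once, and both sum() and len() then work. A short sketch with made-up numbers:

    data = [1.0, 2.0, 4.0]
    mean = sum(data) / len(data)
    lazy = map(lambda x: pow(x - mean, 2), data)
    # len(lazy) -> TypeError: object of type 'map' has no len()
    squares_of_diffs = [pow(x - mean, 2) for x in data]
    print(sum(squares_of_diffs) / len(squares_of_diffs))  # about 1.56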
@@ -48,7 +47,7 @@ def process_comments_for_feed(yaml_items):
     stats = []
     for time_block in time_blocks:
         mean = sum(time_block) / len(time_block)
-        squares_of_diffs = map(lambda x: pow(x - mean, 2), time_block)
+        squares_of_diffs = [pow(x - mean, 2) for x in time_block]
         mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
         std_dev = math.sqrt(mean_of_squares)
         stats.append((mean, std_dev))
@@ -84,7 +83,7 @@ def calculate_median_mean_stddev(time_blocks):
     # Calculate the mean and standard deviation
     if count > 0:
         mean = sum(block) / float(len(block))
-        squares_of_diffs = map(lambda x: pow(x - mean, 2), block)
+        squares_of_diffs = [pow(x - mean, 2) for x in block]
         mean_of_squares = sum(squares_of_diffs) / len(squares_of_diffs)
     else:
         mean = 0
@@ -153,7 +152,7 @@ if __name__=='__main__':
             items = yaml.load(f)
             f.close()
         else:
-            print "could not open", yaml_fullpath
+            print("could not open", yaml_fullpath)
             items = []
 
         weekend_stats, weekday_stats = Process_feed(items, 'fb_shares', 'comment_times')
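Note: the unchanged context line "items = yaml.load(f)" is worth a follow-up commit on Python 3 with a current PyYAML. Since PyYAML 5.1, yaml.load() without an explicit Loader emits a warning, and the full loader can construct arbitrary Python objects from untrusted input; yaml.safe_load() is the usual replacement for plain data files like this one. A sketch (not part of this commit):

    import yaml
    with open(yaml_fullpath) as f:  # yaml_fullpath as used in the script
        items = yaml.safe_load(f)   # plain scalars/lists/dicts only; f closed on exit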
@@ -163,24 +162,24 @@ if __name__=='__main__':
         weekend_threshold = weekend_median + (weekend_sigma)
         median, mean, sigma = weekday_stats[2]
         threshold = median + (sigma)
-        print "Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold)
-        print "Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold)
+        print("Weekend Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (weekend_median, weekend_mean, weekend_sigma, weekend_threshold))
+        print("Weekday Median=%1.1f, Mean=%1.1f, Sigma=%1.1f --> Threshold = %1.1f" % (median, mean, sigma, threshold))
         for item in items:
             if item['qualified'] == -1:
-                print "Processing", item['title'].encode('ascii', 'replace')
+                print("Processing", item['title'].encode('ascii', 'replace'))
                 for i in range(len(item['retweet_times'])):
                     r_time = item['retweet_times'][i]
                     if r_time - item['orig_posted'] < 5400:
-                        print "Time %1.1f = %d" % ((r_time - item['orig_posted']) / 1800.0, item['retweets'][i]),
+                        print("Time %1.1f = %d" % ((r_time - item['orig_posted']) / 1800.0, item['retweets'][i]), end=' ')
                         if item['retweets'][i] >= threshold:
                             item['qualified'] = i
-                            print "NOW QUALIFIES",
+                            print("NOW QUALIFIES", end=' ')
                         if r_time - item['orig_posted'] >= 3600:
                             break
-
+                print()
 
     except Exception as e:
         exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e)
-        print exceptional_text, ' '.join(progress_text)
+        print(exceptional_text, ' '.join(progress_text))
         traceback.print_exc(file=sys.stdout)
 
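Note: the end=' ' conversions above are the exact Python 3 equivalent of Python 2's trailing-comma print, which suppressed the newline and separated successive items with spaces; the added bare print() then terminates the line that those partial prints leave open. For example:

    for n in (1, 2, 3):
        print("n=%d" % n, end=' ')  # builds one line: n=1 n=2 n=3
    print()                         # emits the closing newline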
@@ -1,3612 +0,0 @@ |
| 1 |
-#!/usr/bin/env python |
|
| 2 |
-"""Universal feed parser |
|
| 3 |
- |
|
| 4 |
-Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds |
|
| 5 |
- |
|
| 6 |
-Visit http://feedparser.org/ for the latest version |
|
| 7 |
-Visit http://feedparser.org/docs/ for the latest documentation |
|
| 8 |
- |
|
| 9 |
-Required: Python 2.1 or later |
|
| 10 |
-Recommended: Python 2.3 or later |
|
| 11 |
-Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> |
|
| 12 |
-""" |
|
| 13 |
- |
|
| 14 |
-__version__ = "4.2-pre-" + "$Revision: 291 $"[11:14] + "-svn" |
|
| 15 |
-__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved. |
|
| 16 |
- |
|
| 17 |
-Redistribution and use in source and binary forms, with or without modification, |
|
| 18 |
-are permitted provided that the following conditions are met: |
|
| 19 |
- |
|
| 20 |
-* Redistributions of source code must retain the above copyright notice, |
|
| 21 |
- this list of conditions and the following disclaimer. |
|
| 22 |
-* Redistributions in binary form must reproduce the above copyright notice, |
|
| 23 |
- this list of conditions and the following disclaimer in the documentation |
|
| 24 |
- and/or other materials provided with the distribution. |
|
| 25 |
- |
|
| 26 |
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' |
|
| 27 |
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
| 28 |
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
| 29 |
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
| 30 |
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
| 31 |
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
| 32 |
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
| 33 |
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
| 34 |
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
| 35 |
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
| 36 |
-POSSIBILITY OF SUCH DAMAGE.""" |
|
| 37 |
-__author__ = "Mark Pilgrim <http://diveintomark.org/>" |
|
| 38 |
-__contributors__ = ["Jason Diamond <http://injektilo.org/>", |
|
| 39 |
- "John Beimler <http://john.beimler.org/>", |
|
| 40 |
- "Fazal Majid <http://www.majid.info/mylos/weblog/>", |
|
| 41 |
- "Aaron Swartz <http://aaronsw.com/>", |
|
| 42 |
- "Kevin Marks <http://epeus.blogspot.com/>", |
|
| 43 |
- "Sam Ruby <http://intertwingly.net/>"] |
|
| 44 |
-_debug = 0 |
|
| 45 |
- |
|
| 46 |
-# HTTP "User-Agent" header to send to servers when downloading feeds. |
|
| 47 |
-# If you are embedding feedparser in a larger application, you should |
|
| 48 |
-# change this to your application name and URL. |
|
| 49 |
-USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__ |
|
| 50 |
- |
|
| 51 |
-# HTTP "Accept" header to send to servers when downloading feeds. If you don't |
|
| 52 |
-# want to send an Accept header, set this to None. |
|
| 53 |
-ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1" |
|
| 54 |
- |
|
| 55 |
-# List of preferred XML parsers, by SAX driver name. These will be tried first, |
|
| 56 |
-# but if they're not installed, Python will keep searching through its own list |
|
| 57 |
-# of pre-installed parsers until it finds one that supports everything we need. |
|
| 58 |
-PREFERRED_XML_PARSERS = ["drv_libxml2"] |
|
| 59 |
- |
|
| 60 |
-# If you want feedparser to automatically run HTML markup through HTML Tidy, set |
|
| 61 |
-# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html> |
|
| 62 |
-# or utidylib <http://utidylib.berlios.de/>. |
|
| 63 |
-TIDY_MARKUP = 0 |
|
| 64 |
- |
|
| 65 |
-# List of Python interfaces for HTML Tidy, in order of preference. Only useful |
|
| 66 |
-# if TIDY_MARKUP = 1 |
|
| 67 |
-PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] |
|
| 68 |
- |
|
| 69 |
-# If you want feedparser to automatically resolve all relative URIs, set this |
|
| 70 |
-# to 1. |
|
| 71 |
-RESOLVE_RELATIVE_URIS = 1 |
|
| 72 |
- |
|
| 73 |
-# If you want feedparser to automatically sanitize all potentially unsafe |
|
| 74 |
-# HTML content, set this to 1. |
|
| 75 |
-SANITIZE_HTML = 1 |
|
| 76 |
- |
|
| 77 |
-# ---------- required modules (should come with any Python distribution) ---------- |
|
| 78 |
-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 |
|
| 79 |
-try: |
|
| 80 |
- from cStringIO import StringIO as _StringIO |
|
| 81 |
-except: |
|
| 82 |
- from StringIO import StringIO as _StringIO |
|
| 83 |
- |
|
| 84 |
-# ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- |
|
| 85 |
- |
|
| 86 |
-# gzip is included with most Python distributions, but may not be available if you compiled your own |
|
| 87 |
-try: |
|
| 88 |
- import gzip |
|
| 89 |
-except: |
|
| 90 |
- gzip = None |
|
| 91 |
-try: |
|
| 92 |
- import zlib |
|
| 93 |
-except: |
|
| 94 |
- zlib = None |
|
| 95 |
- |
|
| 96 |
-# If a real XML parser is available, feedparser will attempt to use it. feedparser has |
|
| 97 |
-# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the |
|
| 98 |
-# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some |
|
| 99 |
-# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. |
|
| 100 |
-try: |
|
| 101 |
- import xml.sax |
|
| 102 |
- xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers |
|
| 103 |
- from xml.sax.saxutils import escape as _xmlescape |
|
| 104 |
- _XML_AVAILABLE = 1 |
|
| 105 |
-except: |
|
| 106 |
- _XML_AVAILABLE = 0 |
|
| 107 |
- def _xmlescape(data,entities={}):
|
|
| 108 |
- data = data.replace('&', '&')
|
|
| 109 |
- data = data.replace('>', '>')
|
|
| 110 |
- data = data.replace('<', '<')
|
|
| 111 |
- for char, entity in entities: |
|
| 112 |
- data = data.replace(char, entity) |
|
| 113 |
- return data |
|
| 114 |
- |
|
| 115 |
-# base64 support for Atom feeds that contain embedded binary data |
|
| 116 |
-try: |
|
| 117 |
- import base64, binascii |
|
| 118 |
-except: |
|
| 119 |
- base64 = binascii = None |
|
| 120 |
- |
|
| 121 |
-# cjkcodecs and iconv_codec provide support for more character encodings. |
|
| 122 |
-# Both are available from http://cjkpython.i18n.org/ |
|
| 123 |
-try: |
|
| 124 |
- import cjkcodecs.aliases |
|
| 125 |
-except: |
|
| 126 |
- pass |
|
| 127 |
-try: |
|
| 128 |
- import iconv_codec |
|
| 129 |
-except: |
|
| 130 |
- pass |
|
| 131 |
- |
|
| 132 |
-# chardet library auto-detects character encodings |
|
| 133 |
-# Download from http://chardet.feedparser.org/ |
|
| 134 |
-try: |
|
| 135 |
- import chardet |
|
| 136 |
- if _debug: |
|
| 137 |
- import chardet.constants |
|
| 138 |
- chardet.constants._debug = 1 |
|
| 139 |
-except: |
|
| 140 |
- chardet = None |
|
| 141 |
- |
|
| 142 |
-# reversable htmlentitydefs mappings for Python 2.2 |
|
| 143 |
-try: |
|
| 144 |
- from htmlentitydefs import name2codepoint, codepoint2name |
|
| 145 |
-except: |
|
| 146 |
- import htmlentitydefs |
|
| 147 |
- name2codepoint={}
|
|
| 148 |
- codepoint2name={}
|
|
| 149 |
- for (name,codepoint) in htmlentitydefs.entitydefs.iteritems(): |
|
| 150 |
- if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
|
|
| 151 |
- name2codepoint[name]=ord(codepoint) |
|
| 152 |
- codepoint2name[ord(codepoint)]=name |
|
| 153 |
- |
|
| 154 |
-# BeautifulSoup parser used for parsing microformats from embedded HTML content |
|
| 155 |
-# http://www.crummy.com/software/BeautifulSoup/ |
|
| 156 |
-# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the |
|
| 157 |
-# older 2.x series. If it doesn't, and you can figure out why, I'll accept a |
|
| 158 |
-# patch and modify the compatibility statement accordingly. |
|
| 159 |
-try: |
|
| 160 |
- import BeautifulSoup |
|
| 161 |
-except: |
|
| 162 |
- BeautifulSoup = None |
|
| 163 |
- |
|
| 164 |
-# ---------- don't touch these ---------- |
|
| 165 |
-class ThingsNobodyCaresAboutButMe(Exception): pass |
|
| 166 |
-class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass |
|
| 167 |
-class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass |
|
| 168 |
-class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass |
|
| 169 |
-class UndeclaredNamespace(Exception): pass |
|
| 170 |
- |
|
| 171 |
-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
|
|
| 172 |
-sgmllib.special = re.compile('<!')
|
|
| 173 |
-sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
|
|
| 174 |
- |
|
| 175 |
-if sgmllib.endbracket.search(' <').start(0):
|
|
| 176 |
- class EndBracketMatch: |
|
| 177 |
- endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
|
|
| 178 |
- def search(self,string,index=0): |
|
| 179 |
- self.match = self.endbracket.match(string,index) |
|
| 180 |
- if self.match: return self |
|
| 181 |
- def start(self,n): |
|
| 182 |
- return self.match.end(n) |
|
| 183 |
- sgmllib.endbracket = EndBracketMatch() |
|
| 184 |
- |
|
| 185 |
-SUPPORTED_VERSIONS = {'': 'unknown',
|
|
| 186 |
- 'rss090': 'RSS 0.90', |
|
| 187 |
- 'rss091n': 'RSS 0.91 (Netscape)', |
|
| 188 |
- 'rss091u': 'RSS 0.91 (Userland)', |
|
| 189 |
- 'rss092': 'RSS 0.92', |
|
| 190 |
- 'rss093': 'RSS 0.93', |
|
| 191 |
- 'rss094': 'RSS 0.94', |
|
| 192 |
- 'rss20': 'RSS 2.0', |
|
| 193 |
- 'rss10': 'RSS 1.0', |
|
| 194 |
- 'rss': 'RSS (unknown version)', |
|
| 195 |
- 'atom01': 'Atom 0.1', |
|
| 196 |
- 'atom02': 'Atom 0.2', |
|
| 197 |
- 'atom03': 'Atom 0.3', |
|
| 198 |
- 'atom10': 'Atom 1.0', |
|
| 199 |
- 'atom': 'Atom (unknown version)', |
|
| 200 |
- 'cdf': 'CDF', |
|
| 201 |
- 'hotrss': 'Hot RSS' |
|
| 202 |
- } |
|
| 203 |
- |
|
| 204 |
-try: |
|
| 205 |
- UserDict = dict |
|
| 206 |
-except NameError: |
|
| 207 |
- # Python 2.1 does not have dict |
|
| 208 |
- from UserDict import UserDict |
|
| 209 |
- def dict(aList): |
|
| 210 |
- rc = {}
|
|
| 211 |
- for k, v in aList: |
|
| 212 |
- rc[k] = v |
|
| 213 |
- return rc |
|
| 214 |
- |
|
| 215 |
-class FeedParserDict(UserDict): |
|
| 216 |
- keymap = {'channel': 'feed',
|
|
| 217 |
- 'items': 'entries', |
|
| 218 |
- 'guid': 'id', |
|
| 219 |
- 'date': 'updated', |
|
| 220 |
- 'date_parsed': 'updated_parsed', |
|
| 221 |
- 'description': ['subtitle', 'summary'], |
|
| 222 |
- 'url': ['href'], |
|
| 223 |
- 'modified': 'updated', |
|
| 224 |
- 'modified_parsed': 'updated_parsed', |
|
| 225 |
- 'issued': 'published', |
|
| 226 |
- 'issued_parsed': 'published_parsed', |
|
| 227 |
- 'copyright': 'rights', |
|
| 228 |
- 'copyright_detail': 'rights_detail', |
|
| 229 |
- 'tagline': 'subtitle', |
|
| 230 |
- 'tagline_detail': 'subtitle_detail'} |
|
| 231 |
- def __getitem__(self, key): |
|
| 232 |
- if key == 'category': |
|
| 233 |
- return UserDict.__getitem__(self, 'tags')[0]['term'] |
|
| 234 |
- if key == 'enclosures': |
|
| 235 |
- norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) |
|
| 236 |
- return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] |
|
| 237 |
- if key == 'license': |
|
| 238 |
- for link in UserDict.__getitem__(self, 'links'): |
|
| 239 |
- if link['rel']=='license' and link.has_key('href'):
|
|
| 240 |
- return link['href'] |
|
| 241 |
- if key == 'categories': |
|
| 242 |
- return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] |
|
| 243 |
- realkey = self.keymap.get(key, key) |
|
| 244 |
- if type(realkey) == types.ListType: |
|
| 245 |
- for k in realkey: |
|
| 246 |
- if UserDict.has_key(self, k): |
|
| 247 |
- return UserDict.__getitem__(self, k) |
|
| 248 |
- if UserDict.has_key(self, key): |
|
| 249 |
- return UserDict.__getitem__(self, key) |
|
| 250 |
- return UserDict.__getitem__(self, realkey) |
|
| 251 |
- |
|
| 252 |
- def __setitem__(self, key, value): |
|
| 253 |
- for k in self.keymap.keys(): |
|
| 254 |
- if key == k: |
|
| 255 |
- key = self.keymap[k] |
|
| 256 |
- if type(key) == types.ListType: |
|
| 257 |
- key = key[0] |
|
| 258 |
- return UserDict.__setitem__(self, key, value) |
|
| 259 |
- |
|
| 260 |
- def get(self, key, default=None): |
|
| 261 |
- if self.has_key(key): |
|
| 262 |
- return self[key] |
|
| 263 |
- else: |
|
| 264 |
- return default |
|
| 265 |
- |
|
| 266 |
- def setdefault(self, key, value): |
|
| 267 |
- if not self.has_key(key): |
|
| 268 |
- self[key] = value |
|
| 269 |
- return self[key] |
|
| 270 |
- |
|
| 271 |
- def has_key(self, key): |
|
| 272 |
- try: |
|
| 273 |
- return hasattr(self, key) or UserDict.has_key(self, key) |
|
| 274 |
- except AttributeError: |
|
| 275 |
- return False |
|
| 276 |
- |
|
| 277 |
- def __getattr__(self, key): |
|
| 278 |
- try: |
|
| 279 |
- return self.__dict__[key] |
|
| 280 |
- except KeyError: |
|
| 281 |
- pass |
|
| 282 |
- try: |
|
| 283 |
- assert not key.startswith('_')
|
|
| 284 |
- return self.__getitem__(key) |
|
| 285 |
- except: |
|
| 286 |
- raise AttributeError, "object has no attribute '%s'" % key |
|
| 287 |
- |
|
| 288 |
- def __setattr__(self, key, value): |
|
| 289 |
- if key.startswith('_') or key == 'data':
|
|
| 290 |
- self.__dict__[key] = value |
|
| 291 |
- else: |
|
| 292 |
- return self.__setitem__(key, value) |
|
| 293 |
- |
|
| 294 |
- def __contains__(self, key): |
|
| 295 |
- return self.has_key(key) |
|
| 296 |
- |
|
| 297 |
-def zopeCompatibilityHack(): |
|
| 298 |
- global FeedParserDict |
|
| 299 |
- del FeedParserDict |
|
| 300 |
- def FeedParserDict(aDict=None): |
|
| 301 |
- rc = {}
|
|
| 302 |
- if aDict: |
|
| 303 |
- rc.update(aDict) |
|
| 304 |
- return rc |
|
| 305 |
- |
|
| 306 |
-_ebcdic_to_ascii_map = None |
|
| 307 |
-def _ebcdic_to_ascii(s): |
|
| 308 |
- global _ebcdic_to_ascii_map |
|
| 309 |
- if not _ebcdic_to_ascii_map: |
|
| 310 |
- emap = ( |
|
| 311 |
- 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, |
|
| 312 |
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, |
|
| 313 |
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, |
|
| 314 |
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, |
|
| 315 |
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, |
|
| 316 |
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, |
|
| 317 |
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, |
|
| 318 |
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, |
|
| 319 |
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201, |
|
| 320 |
- 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208, |
|
| 321 |
- 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215, |
|
| 322 |
- 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231, |
|
| 323 |
- 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237, |
|
| 324 |
- 125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243, |
|
| 325 |
- 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, |
|
| 326 |
- 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 |
|
| 327 |
- ) |
|
| 328 |
- import string |
|
| 329 |
- _ebcdic_to_ascii_map = string.maketrans( \ |
|
| 330 |
- ''.join(map(chr, range(256))), ''.join(map(chr, emap))) |
|
| 331 |
- return s.translate(_ebcdic_to_ascii_map) |
|
| 332 |
- |
|
| 333 |
-_cp1252 = {
|
|
| 334 |
- unichr(128): unichr(8364), # euro sign |
|
| 335 |
- unichr(130): unichr(8218), # single low-9 quotation mark |
|
| 336 |
- unichr(131): unichr( 402), # latin small letter f with hook |
|
| 337 |
- unichr(132): unichr(8222), # double low-9 quotation mark |
|
| 338 |
- unichr(133): unichr(8230), # horizontal ellipsis |
|
| 339 |
- unichr(134): unichr(8224), # dagger |
|
| 340 |
- unichr(135): unichr(8225), # double dagger |
|
| 341 |
- unichr(136): unichr( 710), # modifier letter circumflex accent |
|
| 342 |
- unichr(137): unichr(8240), # per mille sign |
|
| 343 |
- unichr(138): unichr( 352), # latin capital letter s with caron |
|
| 344 |
- unichr(139): unichr(8249), # single left-pointing angle quotation mark |
|
| 345 |
- unichr(140): unichr( 338), # latin capital ligature oe |
|
| 346 |
- unichr(142): unichr( 381), # latin capital letter z with caron |
|
| 347 |
- unichr(145): unichr(8216), # left single quotation mark |
|
| 348 |
- unichr(146): unichr(8217), # right single quotation mark |
|
| 349 |
- unichr(147): unichr(8220), # left double quotation mark |
|
| 350 |
- unichr(148): unichr(8221), # right double quotation mark |
|
| 351 |
- unichr(149): unichr(8226), # bullet |
|
| 352 |
- unichr(150): unichr(8211), # en dash |
|
| 353 |
- unichr(151): unichr(8212), # em dash |
|
| 354 |
- unichr(152): unichr( 732), # small tilde |
|
| 355 |
- unichr(153): unichr(8482), # trade mark sign |
|
| 356 |
- unichr(154): unichr( 353), # latin small letter s with caron |
|
| 357 |
- unichr(155): unichr(8250), # single right-pointing angle quotation mark |
|
| 358 |
- unichr(156): unichr( 339), # latin small ligature oe |
|
| 359 |
- unichr(158): unichr( 382), # latin small letter z with caron |
|
| 360 |
- unichr(159): unichr( 376)} # latin capital letter y with diaeresis |
|
| 361 |
- |
|
| 362 |
-_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
|
|
| 363 |
-def _urljoin(base, uri): |
|
| 364 |
- uri = _urifixer.sub(r'\1\3', uri) |
|
| 365 |
- try: |
|
| 366 |
- return urlparse.urljoin(base, uri) |
|
| 367 |
- except: |
|
| 368 |
- uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)]) |
|
| 369 |
- return urlparse.urljoin(base, uri) |
|
| 370 |
- |
|
| 371 |
-class _FeedParserMixin: |
|
| 372 |
- namespaces = {'': '',
|
|
| 373 |
- 'http://backend.userland.com/rss': '', |
|
| 374 |
- 'http://blogs.law.harvard.edu/tech/rss': '', |
|
| 375 |
- 'http://purl.org/rss/1.0/': '', |
|
| 376 |
- 'http://my.netscape.com/rdf/simple/0.9/': '', |
|
| 377 |
- 'http://example.com/newformat#': '', |
|
| 378 |
- 'http://example.com/necho': '', |
|
| 379 |
- 'http://purl.org/echo/': '', |
|
| 380 |
- 'uri/of/echo/namespace#': '', |
|
| 381 |
- 'http://purl.org/pie/': '', |
|
| 382 |
- 'http://purl.org/atom/ns#': '', |
|
| 383 |
- 'http://www.w3.org/2005/Atom': '', |
|
| 384 |
- 'http://purl.org/rss/1.0/modules/rss091#': '', |
|
| 385 |
- |
|
| 386 |
- 'http://webns.net/mvcb/': 'admin', |
|
| 387 |
- 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', |
|
| 388 |
- 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', |
|
| 389 |
- 'http://media.tangent.org/rss/1.0/': 'audio', |
|
| 390 |
- 'http://backend.userland.com/blogChannelModule': 'blogChannel', |
|
| 391 |
- 'http://web.resource.org/cc/': 'cc', |
|
| 392 |
- 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', |
|
| 393 |
- 'http://purl.org/rss/1.0/modules/company': 'co', |
|
| 394 |
- 'http://purl.org/rss/1.0/modules/content/': 'content', |
|
| 395 |
- 'http://my.theinfo.org/changed/1.0/rss/': 'cp', |
|
| 396 |
- 'http://purl.org/dc/elements/1.1/': 'dc', |
|
| 397 |
- 'http://purl.org/dc/terms/': 'dcterms', |
|
| 398 |
- 'http://purl.org/rss/1.0/modules/email/': 'email', |
|
| 399 |
- 'http://purl.org/rss/1.0/modules/event/': 'ev', |
|
| 400 |
- 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', |
|
| 401 |
- 'http://freshmeat.net/rss/fm/': 'fm', |
|
| 402 |
- 'http://xmlns.com/foaf/0.1/': 'foaf', |
|
| 403 |
- 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', |
|
| 404 |
- 'http://postneo.com/icbm/': 'icbm', |
|
| 405 |
- 'http://purl.org/rss/1.0/modules/image/': 'image', |
|
| 406 |
- 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', |
|
| 407 |
- 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', |
|
| 408 |
- 'http://purl.org/rss/1.0/modules/link/': 'l', |
|
| 409 |
- 'http://search.yahoo.com/mrss': 'media', |
|
| 410 |
- 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', |
|
| 411 |
- 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', |
|
| 412 |
- 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', |
|
| 413 |
- 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', |
|
| 414 |
- 'http://purl.org/rss/1.0/modules/reference/': 'ref', |
|
| 415 |
- 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', |
|
| 416 |
- 'http://purl.org/rss/1.0/modules/search/': 'search', |
|
| 417 |
- 'http://purl.org/rss/1.0/modules/slash/': 'slash', |
|
| 418 |
- 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', |
|
| 419 |
- 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', |
|
| 420 |
- 'http://hacks.benhammersley.com/rss/streaming/': 'str', |
|
| 421 |
- 'http://purl.org/rss/1.0/modules/subscription/': 'sub', |
|
| 422 |
- 'http://purl.org/rss/1.0/modules/syndication/': 'sy', |
|
| 423 |
- 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', |
|
| 424 |
- 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', |
|
| 425 |
- 'http://purl.org/rss/1.0/modules/threading/': 'thr', |
|
| 426 |
- 'http://purl.org/rss/1.0/modules/textinput/': 'ti', |
|
| 427 |
- 'http://madskills.com/public/xml/rss/module/trackback/':'trackback', |
|
| 428 |
- 'http://wellformedweb.org/commentAPI/': 'wfw', |
|
| 429 |
- 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', |
|
| 430 |
- 'http://www.w3.org/1999/xhtml': 'xhtml', |
|
| 431 |
- 'http://www.w3.org/1999/xlink': 'xlink', |
|
| 432 |
- 'http://www.w3.org/XML/1998/namespace': 'xml' |
|
| 433 |
-} |
|
| 434 |
- _matchnamespaces = {}
|
|
| 435 |
- |
|
| 436 |
- can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'] |
|
| 437 |
- can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] |
|
| 438 |
- can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] |
|
| 439 |
- html_types = ['text/html', 'application/xhtml+xml'] |
|
| 440 |
- |
|
| 441 |
- def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): |
|
| 442 |
- if _debug: sys.stderr.write('initializing FeedParser\n')
|
|
| 443 |
- if not self._matchnamespaces: |
|
| 444 |
- for k, v in self.namespaces.items(): |
|
| 445 |
- self._matchnamespaces[k.lower()] = v |
|
| 446 |
- self.feeddata = FeedParserDict() # feed-level data |
|
| 447 |
- self.encoding = encoding # character encoding |
|
| 448 |
- self.entries = [] # list of entry-level data |
|
| 449 |
- self.version = '' # feed type/version, see SUPPORTED_VERSIONS |
|
| 450 |
- self.namespacesInUse = {} # dictionary of namespaces defined by the feed
|
|
| 451 |
- |
|
| 452 |
- # the following are used internally to track state; |
|
| 453 |
- # this is really out of control and should be refactored |
|
| 454 |
- self.infeed = 0 |
|
| 455 |
- self.inentry = 0 |
|
| 456 |
- self.incontent = 0 |
|
| 457 |
- self.intextinput = 0 |
|
| 458 |
- self.inimage = 0 |
|
| 459 |
- self.inauthor = 0 |
|
| 460 |
- self.incontributor = 0 |
|
| 461 |
- self.inpublisher = 0 |
|
| 462 |
- self.insource = 0 |
|
| 463 |
- self.sourcedata = FeedParserDict() |
|
| 464 |
- self.contentparams = FeedParserDict() |
|
| 465 |
- self._summaryKey = None |
|
| 466 |
- self.namespacemap = {}
|
|
| 467 |
- self.elementstack = [] |
|
| 468 |
- self.basestack = [] |
|
| 469 |
- self.langstack = [] |
|
| 470 |
- self.baseuri = baseuri or '' |
|
| 471 |
- self.lang = baselang or None |
|
| 472 |
- self.svgOK = 0 |
|
| 473 |
- self.hasTitle = 0 |
|
| 474 |
- if baselang: |
|
| 475 |
- self.feeddata['language'] = baselang.replace('_','-')
|
|
| 476 |
- |
|
| 477 |
- def unknown_starttag(self, tag, attrs): |
|
| 478 |
- if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
|
|
| 479 |
- # normalize attrs |
|
| 480 |
- attrs = [(k.lower(), v) for k, v in attrs] |
|
| 481 |
- attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
|
|
| 482 |
- |
|
| 483 |
- # track xml:base and xml:lang |
|
| 484 |
- attrsD = dict(attrs) |
|
| 485 |
- baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
|
|
| 486 |
- if type(baseuri) != type(u''): |
|
| 487 |
- try: |
|
| 488 |
- baseuri = unicode(baseuri, self.encoding) |
|
| 489 |
- except: |
|
| 490 |
- baseuri = unicode(baseuri, 'iso-8859-1') |
|
| 491 |
- self.baseuri = _urljoin(self.baseuri, baseuri) |
|
| 492 |
- lang = attrsD.get('xml:lang', attrsD.get('lang'))
|
|
| 493 |
- if lang == '': |
|
| 494 |
- # xml:lang could be explicitly set to '', we need to capture that |
|
| 495 |
- lang = None |
|
| 496 |
- elif lang is None: |
|
| 497 |
- # if no xml:lang is specified, use parent lang |
|
| 498 |
- lang = self.lang |
|
| 499 |
- if lang: |
|
| 500 |
- if tag in ('feed', 'rss', 'rdf:RDF'):
|
|
| 501 |
- self.feeddata['language'] = lang.replace('_','-')
|
|
| 502 |
- self.lang = lang |
|
| 503 |
- self.basestack.append(self.baseuri) |
|
| 504 |
- self.langstack.append(lang) |
|
| 505 |
- |
|
| 506 |
- # track namespaces |
|
| 507 |
- for prefix, uri in attrs: |
|
| 508 |
- if prefix.startswith('xmlns:'):
|
|
| 509 |
- self.trackNamespace(prefix[6:], uri) |
|
| 510 |
- elif prefix == 'xmlns': |
|
| 511 |
- self.trackNamespace(None, uri) |
|
| 512 |
- |
|
| 513 |
- # track inline content |
|
| 514 |
- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
|
|
| 515 |
- if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 |
|
| 516 |
- # element declared itself as escaped markup, but it isn't really |
|
| 517 |
- self.contentparams['type'] = 'application/xhtml+xml' |
|
| 518 |
- if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
|
|
| 519 |
- if tag.find(':') <> -1:
|
|
| 520 |
- prefix, tag = tag.split(':', 1)
|
|
| 521 |
- namespace = self.namespacesInUse.get(prefix, '') |
|
| 522 |
- if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML': |
|
| 523 |
- attrs.append(('xmlns',namespace))
|
|
| 524 |
- if tag=='svg' and namespace=='http://www.w3.org/2000/svg': |
|
| 525 |
- attrs.append(('xmlns',namespace))
|
|
| 526 |
- if tag == 'svg': self.svgOK += 1 |
|
| 527 |
- return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
|
|
| 528 |
- |
|
| 529 |
- # match namespaces |
|
| 530 |
- if tag.find(':') <> -1:
|
|
| 531 |
- prefix, suffix = tag.split(':', 1)
|
|
| 532 |
- else: |
|
| 533 |
- prefix, suffix = '', tag |
|
| 534 |
- prefix = self.namespacemap.get(prefix, prefix) |
|
| 535 |
- if prefix: |
|
| 536 |
- prefix = prefix + '_' |
|
| 537 |
- |
|
| 538 |
- # special hack for better tracking of empty textinput/image elements in illformed feeds |
|
| 539 |
- if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
|
|
| 540 |
- self.intextinput = 0 |
|
| 541 |
- if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
|
|
| 542 |
- self.inimage = 0 |
|
| 543 |
- |
|
| 544 |
- # call special handler (if defined) or default handler |
|
| 545 |
- methodname = '_start_' + prefix + suffix |
|
| 546 |
- try: |
|
| 547 |
- method = getattr(self, methodname) |
|
| 548 |
- return method(attrsD) |
|
| 549 |
- except AttributeError: |
|
| 550 |
- return self.push(prefix + suffix, 1) |
|
| 551 |
- |
|
| 552 |
- def unknown_endtag(self, tag): |
|
| 553 |
- if _debug: sys.stderr.write('end %s\n' % tag)
|
|
| 554 |
- # match namespaces |
|
| 555 |
- if tag.find(':') <> -1:
|
|
| 556 |
- prefix, suffix = tag.split(':', 1)
|
|
| 557 |
- else: |
|
| 558 |
- prefix, suffix = '', tag |
|
| 559 |
- prefix = self.namespacemap.get(prefix, prefix) |
|
| 560 |
- if prefix: |
|
| 561 |
- prefix = prefix + '_' |
|
| 562 |
- if suffix == 'svg' and self.svgOK: self.svgOK -= 1 |
|
| 563 |
- |
|
| 564 |
- # call special handler (if defined) or default handler |
|
| 565 |
- methodname = '_end_' + prefix + suffix |
|
| 566 |
- try: |
|
| 567 |
- if self.svgOK: raise AttributeError() |
|
| 568 |
- method = getattr(self, methodname) |
|
| 569 |
- method() |
|
| 570 |
- except AttributeError: |
|
| 571 |
- self.pop(prefix + suffix) |
|
| 572 |
- |
|
| 573 |
- # track inline content |
|
| 574 |
- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
|
|
| 575 |
- # element declared itself as escaped markup, but it isn't really |
|
| 576 |
- if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 |
|
| 577 |
- self.contentparams['type'] = 'application/xhtml+xml' |
|
| 578 |
- if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
|
|
| 579 |
- tag = tag.split(':')[-1]
|
|
| 580 |
- self.handle_data('</%s>' % tag, escape=0)
|
|
| 581 |
- |
|
| 582 |
- # track xml:base and xml:lang going out of scope |
|
| 583 |
- if self.basestack: |
|
| 584 |
- self.basestack.pop() |
|
| 585 |
- if self.basestack and self.basestack[-1]: |
|
| 586 |
- self.baseuri = self.basestack[-1] |
|
| 587 |
- if self.langstack: |
|
| 588 |
- self.langstack.pop() |
|
| 589 |
- if self.langstack: # and (self.langstack[-1] is not None): |
|
| 590 |
- self.lang = self.langstack[-1] |
|
| 591 |
- |
|
| 592 |
- def handle_charref(self, ref): |
|
| 593 |
- # called for each character reference, e.g. for ' ', ref will be '160' |
|
| 594 |
- if not self.elementstack: return |
|
| 595 |
- ref = ref.lower() |
|
| 596 |
- if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
|
|
| 597 |
- text = '&#%s;' % ref |
|
| 598 |
- else: |
|
| 599 |
- if ref[0] == 'x': |
|
| 600 |
- c = int(ref[1:], 16) |
|
| 601 |
- else: |
|
| 602 |
- c = int(ref) |
|
| 603 |
- text = unichr(c).encode('utf-8')
|
|
| 604 |
- self.elementstack[-1][2].append(text) |
|
| 605 |
- |
|
| 606 |
- def handle_entityref(self, ref): |
|
| 607 |
- # called for each entity reference, e.g. for '©', ref will be 'copy' |
|
| 608 |
- if not self.elementstack: return |
|
| 609 |
- if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
|
|
| 610 |
- if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
|
|
| 611 |
- text = '&%s;' % ref |
|
| 612 |
- elif ref in self.entities.keys(): |
|
| 613 |
- text = self.entities[ref] |
|
| 614 |
- if text.startswith('&#') and text.endswith(';'):
|
|
| 615 |
- return self.handle_entityref(text) |
|
| 616 |
- else: |
|
| 617 |
- try: name2codepoint[ref] |
|
| 618 |
- except KeyError: text = '&%s;' % ref |
|
| 619 |
- else: text = unichr(name2codepoint[ref]).encode('utf-8')
|
|
| 620 |
- self.elementstack[-1][2].append(text) |
|
| 621 |
- |
|
| 622 |
- def handle_data(self, text, escape=1): |
|
| 623 |
- # called for each block of plain text, i.e. outside of any tag and |
|
| 624 |
- # not containing any character or entity references |
|
| 625 |
- if not self.elementstack: return |
|
| 626 |
- if escape and self.contentparams.get('type') == 'application/xhtml+xml':
|
|
| 627 |
- text = _xmlescape(text) |
|
| 628 |
- self.elementstack[-1][2].append(text) |
|
| 629 |
- |
|
| 630 |
- def handle_comment(self, text): |
|
| 631 |
- # called for each comment, e.g. <!-- insert message here --> |
|
| 632 |
- pass |
|
| 633 |
- |
|
| 634 |
- def handle_pi(self, text): |
|
| 635 |
- # called for each processing instruction, e.g. <?instruction> |
|
| 636 |
- pass |
|
| 637 |
- |
|
| 638 |
- def handle_decl(self, text): |
|
| 639 |
- pass |
|
| 640 |
- |
|
| 641 |
- def parse_declaration(self, i): |
|
| 642 |
- # override internal declaration handler to handle CDATA blocks |
|
| 643 |
- if _debug: sys.stderr.write('entering parse_declaration\n')
|
|
| 644 |
- if self.rawdata[i:i+9] == '<![CDATA[': |
|
| 645 |
- k = self.rawdata.find(']]>', i)
|
|
| 646 |
- if k == -1: k = len(self.rawdata) |
|
| 647 |
- self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) |
|
| 648 |
- return k+3 |
|
| 649 |
- else: |
|
| 650 |
- k = self.rawdata.find('>', i)
|
|
| 651 |
- return k+1 |
|
| 652 |
- |
|
| 653 |
- def mapContentType(self, contentType): |
|
| 654 |
- contentType = contentType.lower() |
|
| 655 |
- if contentType == 'text': |
|
| 656 |
- contentType = 'text/plain' |
|
| 657 |
- elif contentType == 'html': |
|
| 658 |
- contentType = 'text/html' |
|
| 659 |
- elif contentType == 'xhtml': |
|
| 660 |
- contentType = 'application/xhtml+xml' |
|
| 661 |
- return contentType |
|
| 662 |
- |
|
| 663 |
- def trackNamespace(self, prefix, uri): |
|
| 664 |
- loweruri = uri.lower() |
|
| 665 |
- if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: |
|
| 666 |
- self.version = 'rss090' |
|
| 667 |
- if loweruri == 'http://purl.org/rss/1.0/' and not self.version: |
|
| 668 |
- self.version = 'rss10' |
|
| 669 |
- if loweruri == 'http://www.w3.org/2005/atom' and not self.version: |
|
| 670 |
- self.version = 'atom10' |
|
| 671 |
- if loweruri.find('backend.userland.com/rss') <> -1:
|
|
| 672 |
- # match any backend.userland.com namespace |
|
| 673 |
- uri = 'http://backend.userland.com/rss' |
|
| 674 |
- loweruri = uri |
|
| 675 |
- if self._matchnamespaces.has_key(loweruri): |
|
| 676 |
- self.namespacemap[prefix] = self._matchnamespaces[loweruri] |
|
| 677 |
- self.namespacesInUse[self._matchnamespaces[loweruri]] = uri |
|
| 678 |
- else: |
|
| 679 |
- self.namespacesInUse[prefix or ''] = uri |
|
| 680 |
- |
|
| 681 |
- def resolveURI(self, uri): |
|
| 682 |
- return _urljoin(self.baseuri or '', uri) |
|
| 683 |
- |
|
| 684 |
- def decodeEntities(self, element, data): |
|
| 685 |
- return data |
|
| 686 |
- |
|
| 687 |
- def strattrs(self, attrs): |
|
| 688 |
- return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs])
|
|
| 689 |
- |
|
| 690 |
- def push(self, element, expectingText): |
|
| 691 |
- self.elementstack.append([element, expectingText, []]) |
|
| 692 |
- |
|
| 693 |
- def pop(self, element, stripWhitespace=1): |
|
| 694 |
- if not self.elementstack: return |
|
| 695 |
- if self.elementstack[-1][0] != element: return |
|
| 696 |
- |
|
| 697 |
- element, expectingText, pieces = self.elementstack.pop() |
|
| 698 |
- |
|
| 699 |
- if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
|
|
| 700 |
- # remove enclosing child element, but only if it is a <div> and |
|
| 701 |
- # only if all the remaining content is nested underneath it. |
|
| 702 |
- # This means that the divs would be retained in the following: |
|
| 703 |
- # <div>foo</div><div>bar</div> |
|
| 704 |
- while pieces and len(pieces)>1 and not pieces[-1].strip(): |
|
| 705 |
- del pieces[-1] |
|
| 706 |
- while pieces and len(pieces)>1 and not pieces[0].strip(): |
|
| 707 |
- del pieces[0] |
|
| 708 |
- if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
|
|
| 709 |
- depth = 0 |
|
| 710 |
- for piece in pieces[:-1]: |
|
| 711 |
- if piece.startswith('</'):
|
|
| 712 |
- depth -= 1 |
|
| 713 |
- if depth == 0: break |
|
| 714 |
- elif piece.startswith('<') and not piece.endswith('/>'):
|
|
| 715 |
- depth += 1 |
|
| 716 |
- else: |
|
| 717 |
- pieces = pieces[1:-1] |
|
| 718 |
- |
|
| 719 |
- output = ''.join(pieces) |
|
| 720 |
- if stripWhitespace: |
|
| 721 |
- output = output.strip() |
|
| 722 |
- if not expectingText: return output |
|
| 723 |
- |
|
| 724 |
- # decode base64 content |
|
| 725 |
- if base64 and self.contentparams.get('base64', 0):
|
|
| 726 |
- try: |
|
| 727 |
- output = base64.decodestring(output) |
|
| 728 |
- except binascii.Error: |
|
| 729 |
- pass |
|
| 730 |
- except binascii.Incomplete: |
|
| 731 |
- pass |
|
| 732 |
- |
|
| 733 |
- # resolve relative URIs |
|
| 734 |
- if (element in self.can_be_relative_uri) and output: |
|
| 735 |
- output = self.resolveURI(output) |
|
| 736 |
- |
|
| 737 |
- # decode entities within embedded markup |
|
| 738 |
- if not self.contentparams.get('base64', 0):
|
|
| 739 |
- output = self.decodeEntities(element, output) |
|
| 740 |
- |
|
| 741 |
- if self.lookslikehtml(output): |
|
| 742 |
- self.contentparams['type']='text/html' |
|
| 743 |
- |
|
| 744 |
- # remove temporary cruft from contentparams |
|
| 745 |
- try: |
|
| 746 |
- del self.contentparams['mode'] |
|
| 747 |
- except KeyError: |
|
| 748 |
- pass |
|
| 749 |
- try: |
|
| 750 |
- del self.contentparams['base64'] |
|
| 751 |
- except KeyError: |
|
| 752 |
- pass |
|
| 753 |
- |
|
| 754 |
- is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
|
|
| 755 |
- # resolve relative URIs within embedded markup |
|
| 756 |
- if is_htmlish and RESOLVE_RELATIVE_URIS: |
|
| 757 |
- if element in self.can_contain_relative_uris: |
|
| 758 |
- output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
|
|
| 759 |
- |
|
| 760 |
- # parse microformats |
|
| 761 |
- # (must do this before sanitizing because some microformats |
|
| 762 |
- # rely on elements that we sanitize) |
|
| 763 |
- if is_htmlish and element in ['content', 'description', 'summary']: |
|
| 764 |
- mfresults = _parseMicroformats(output, self.baseuri, self.encoding) |
|
| 765 |
- if mfresults: |
|
| 766 |
- for tag in mfresults.get('tags', []):
|
|
| 767 |
- self._addTag(tag['term'], tag['scheme'], tag['label']) |
|
| 768 |
- for enclosure in mfresults.get('enclosures', []):
|
|
| 769 |
- self._start_enclosure(enclosure) |
|
| 770 |
- for xfn in mfresults.get('xfn', []):
|
|
| 771 |
- self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) |
|
| 772 |
- vcard = mfresults.get('vcard')
|
|
| 773 |
- if vcard: |
|
| 774 |
- self._getContext()['vcard'] = vcard |
|
| 775 |
- |
|
| 776 |
- # sanitize embedded markup |
|
| 777 |
- if is_htmlish and SANITIZE_HTML: |
|
| 778 |
- if element in self.can_contain_dangerous_markup: |
|
| 779 |
- output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
|
| 780 |
- |
|
| 781 |
- if self.encoding and type(output) != type(u''): |
|
| 782 |
- try: |
|
| 783 |
- output = unicode(output, self.encoding) |
|
| 784 |
- except: |
|
| 785 |
- pass |
|
| 786 |
- |
|
| 787 |
- # address common error where people take data that is already |
|
| 788 |
- # utf-8, presume that it is iso-8859-1, and re-encode it. |
|
| 789 |
- if self.encoding=='utf-8' and type(output) == type(u''): |
|
| 790 |
- try: |
|
| 791 |
- output = unicode(output.encode('iso-8859-1'), 'utf-8')
|
|
| 792 |
- except: |
|
| 793 |
- pass |
|
| 794 |
- |
|
| 795 |
- # map win-1252 extensions to the proper code points |
|
| 796 |
- if type(output) == type(u''): |
|
| 797 |
- output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) |
|
| 798 |
- |
|
| 799 |
- # categories/tags/keywords/whatever are handled in _end_category |
|
| 800 |
- if element == 'category': |
|
| 801 |
- return output |
|
| 802 |
- |
|
| 803 |
- if element == 'title' and self.hasTitle: |
|
| 804 |
- return output |
|
| 805 |
- |
|
| 806 |
- # store output in appropriate place(s) |
|
| 807 |
- if self.inentry and not self.insource: |
|
| 808 |
- if element == 'content': |
|
| 809 |
- self.entries[-1].setdefault(element, []) |
|
| 810 |
- contentparams = copy.deepcopy(self.contentparams) |
|
| 811 |
- contentparams['value'] = output |
|
| 812 |
- self.entries[-1][element].append(contentparams) |
|
| 813 |
- elif element == 'link': |
|
| 814 |
- self.entries[-1][element] = output |
|
| 815 |
- if output: |
|
| 816 |
- self.entries[-1]['links'][-1]['href'] = output |
|
| 817 |
- else: |
|
| 818 |
- if element == 'description': |
|
| 819 |
- element = 'summary' |
|
| 820 |
- self.entries[-1][element] = output |
|
| 821 |
- if self.incontent: |
|
| 822 |
- contentparams = copy.deepcopy(self.contentparams) |
|
| 823 |
- contentparams['value'] = output |
|
| 824 |
- self.entries[-1][element + '_detail'] = contentparams |
|
| 825 |
- elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage): |
|
| 826 |
- context = self._getContext() |
|
| 827 |
- if element == 'description': |
|
| 828 |
- element = 'subtitle' |
|
| 829 |
- context[element] = output |
|
| 830 |
- if element == 'link': |
|
| 831 |
- context['links'][-1]['href'] = output |
|
| 832 |
- elif self.incontent: |
|
| 833 |
- contentparams = copy.deepcopy(self.contentparams) |
|
| 834 |
- contentparams['value'] = output |
|
| 835 |
- context[element + '_detail'] = contentparams |
|
| 836 |
- return output |
|
| 837 |
- |
|
| 838 |
- def pushContent(self, tag, attrsD, defaultContentType, expectingText): |
|
| 839 |
- self.incontent += 1 |
|
| 840 |
- if self.lang: self.lang=self.lang.replace('_','-')
|
|
| 841 |
- self.contentparams = FeedParserDict({
|
|
| 842 |
- 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
|
|
| 843 |
- 'language': self.lang, |
|
| 844 |
- 'base': self.baseuri}) |
|
| 845 |
- self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) |
|
| 846 |
- self.push(tag, expectingText) |
|
| 847 |
- |
|
| 848 |
- def popContent(self, tag): |
|
| 849 |
- value = self.pop(tag) |
|
| 850 |
- self.incontent -= 1 |
|
| 851 |
- self.contentparams.clear() |
|
| 852 |
- return value |
|
| 853 |
- |
|
| 854 |
- # a number of elements in a number of RSS variants are nominally plain |
|
| 855 |
- # text, but this is routinely ignored. This is an attempt to detect |
|
| 856 |
- # the most common cases. As false positives often result in silent |
|
| 857 |
- # data loss, this function errs on the conservative side. |
|
| 858 |
- def lookslikehtml(self, str): |
|
| 859 |
- if self.version.startswith('atom'): return
|
|
| 860 |
- if self.contentparams.get('type','text/html') != 'text/plain': return
|
|
| 861 |
- |
|
| 862 |
- # must have a close tag or a entity reference to qualify |
|
| 863 |
- if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return
|
|
| 864 |
- |
|
| 865 |
- # all tags must be in a restricted subset of valid HTML tags |
|
| 866 |
- if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, |
|
| 867 |
- re.findall(r'</?(\w+)',str)): return |
|
| 868 |
- |
|
| 869 |
- # all entities must have been defined as valid HTML entities |
|
| 870 |
- from htmlentitydefs import entitydefs |
|
| 871 |
- if filter(lambda e: e not in entitydefs.keys(), |
|
| 872 |
- re.findall(r'&(\w+);',str)): return |
|
| 873 |
- |
|
| 874 |
- return 1 |
|
| 875 |
- |
|
| 876 |
- def _mapToStandardPrefix(self, name): |
|
| 877 |
- colonpos = name.find(':')
|
|
| 878 |
- if colonpos <> -1: |
|
| 879 |
- prefix = name[:colonpos] |
|
| 880 |
- suffix = name[colonpos+1:] |
|
| 881 |
- prefix = self.namespacemap.get(prefix, prefix) |
|
| 882 |
- name = prefix + ':' + suffix |
|
| 883 |
- return name |
|
| 884 |
- |
|
| 885 |
- def _getAttribute(self, attrsD, name): |
|
| 886 |
- return attrsD.get(self._mapToStandardPrefix(name)) |
|
| 887 |
- |
|
| 888 |
- def _isBase64(self, attrsD, contentparams): |
|
| 889 |
- if attrsD.get('mode', '') == 'base64':
|
|
| 890 |
- return 1 |
|
| 891 |
- if self.contentparams['type'].startswith('text/'):
|
|
| 892 |
- return 0 |
|
| 893 |
- if self.contentparams['type'].endswith('+xml'):
|
|
| 894 |
- return 0 |
|
| 895 |
- if self.contentparams['type'].endswith('/xml'):
|
|
| 896 |
- return 0 |
|
| 897 |
- return 1 |
|
| 898 |
- |
|
| 899 |
- def _itsAnHrefDamnIt(self, attrsD): |
|
| 900 |
- href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
|
|
| 901 |
- if href: |
|
| 902 |
- try: |
|
| 903 |
- del attrsD['url'] |
|
| 904 |
- except KeyError: |
|
| 905 |
- pass |
|
| 906 |
- try: |
|
| 907 |
- del attrsD['uri'] |
|
| 908 |
- except KeyError: |
|
| 909 |
- pass |
|
| 910 |
- attrsD['href'] = href |
|
| 911 |
- return attrsD |
|
| 912 |
- |
|
| 913 |
- def _save(self, key, value): |
|
| 914 |
- context = self._getContext() |
|
| 915 |
- context.setdefault(key, value) |
|
| 916 |
- |
|
| 917 |
- def _start_rss(self, attrsD): |
|
| 918 |
- versionmap = {'0.91': 'rss091u',
|
|
| 919 |
- '0.92': 'rss092', |
|
| 920 |
- '0.93': 'rss093', |
|
| 921 |
- '0.94': 'rss094'} |
|
| 922 |
- if not self.version: |
|
| 923 |
- attr_version = attrsD.get('version', '')
|
|
| 924 |
- version = versionmap.get(attr_version) |
|
| 925 |
- if version: |
|
| 926 |
- self.version = version |
|
| 927 |
- elif attr_version.startswith('2.'):
|
|
| 928 |
- self.version = 'rss20' |
|
| 929 |
- else: |
|
| 930 |
- self.version = 'rss' |
|
| 931 |
- |
|
| 932 |
- def _start_dlhottitles(self, attrsD): |
|
| 933 |
- self.version = 'hotrss' |
|
| 934 |
- |
|
| 935 |
- def _start_channel(self, attrsD): |
|
| 936 |
- self.infeed = 1 |
|
| 937 |
- self._cdf_common(attrsD) |
|
| 938 |
- _start_feedinfo = _start_channel |
|
| 939 |
- |
|
| 940 |
- def _cdf_common(self, attrsD): |
|
| 941 |
- if attrsD.has_key('lastmod'):
|
|
| 942 |
- self._start_modified({})
|
|
| 943 |
- self.elementstack[-1][-1] = attrsD['lastmod'] |
|
| 944 |
- self._end_modified() |
|
| 945 |
- if attrsD.has_key('href'):
|
|
| 946 |
- self._start_link({})
|
|
| 947 |
- self.elementstack[-1][-1] = attrsD['href'] |
|
| 948 |
- self._end_link() |
|
| 949 |
- |
|
| 950 |
- def _start_feed(self, attrsD): |
|
| 951 |
- self.infeed = 1 |
|
| 952 |
- versionmap = {'0.1': 'atom01',
|
|
| 953 |
- '0.2': 'atom02', |
|
| 954 |
- '0.3': 'atom03'} |
|
| 955 |
- if not self.version: |
|
| 956 |
- attr_version = attrsD.get('version')
|
|
| 957 |
- version = versionmap.get(attr_version) |
|
| 958 |
- if version: |
|
| 959 |
- self.version = version |
|
| 960 |
- else: |
|
| 961 |
- self.version = 'atom' |
|
| 962 |
- |
|
| 963 |
- def _end_channel(self): |
|
| 964 |
- self.infeed = 0 |
|
| 965 |
- _end_feed = _end_channel |
|
| 966 |
- |
|
| 967 |
- def _start_image(self, attrsD): |
|
| 968 |
- context = self._getContext() |
|
| 969 |
- context.setdefault('image', FeedParserDict())
|
|
| 970 |
- self.inimage = 1 |
|
| 971 |
- self.hasTitle = 0 |
|
| 972 |
- self.push('image', 0)
|
|
| 973 |
- |
|
| 974 |
- def _end_image(self): |
|
| 975 |
- self.pop('image')
|
|
| 976 |
- self.inimage = 0 |
|
| 977 |
- |
|
| 978 |
- def _start_textinput(self, attrsD): |
|
| 979 |
- context = self._getContext() |
|
| 980 |
- context.setdefault('textinput', FeedParserDict())
|
|
| 981 |
- self.intextinput = 1 |
|
| 982 |
- self.hasTitle = 0 |
|
| 983 |
- self.push('textinput', 0)
|
|
| 984 |
- _start_textInput = _start_textinput |
|
| 985 |
- |
|
| 986 |
- def _end_textinput(self): |
|
| 987 |
- self.pop('textinput')
|
|
| 988 |
- self.intextinput = 0 |
|
| 989 |
- _end_textInput = _end_textinput |
|
| 990 |
- |
|
| 991 |
- def _start_author(self, attrsD): |
|
| 992 |
- self.inauthor = 1 |
|
| 993 |
- self.push('author', 1)
|
|
| 994 |
- _start_managingeditor = _start_author |
|
| 995 |
- _start_dc_author = _start_author |
|
| 996 |
- _start_dc_creator = _start_author |
|
| 997 |
- _start_itunes_author = _start_author |
|
| 998 |
- |
|
| 999 |
- def _end_author(self): |
|
| 1000 |
- self.pop('author')
|
|
| 1001 |
- self.inauthor = 0 |
|
| 1002 |
- self._sync_author_detail() |
|
| 1003 |
- _end_managingeditor = _end_author |
|
| 1004 |
- _end_dc_author = _end_author |
|
| 1005 |
- _end_dc_creator = _end_author |
|
| 1006 |
- _end_itunes_author = _end_author |
|
| 1007 |
- |
|
| 1008 |
- def _start_itunes_owner(self, attrsD): |
|
| 1009 |
- self.inpublisher = 1 |
|
| 1010 |
- self.push('publisher', 0)
|
|
| 1011 |
- |
|
| 1012 |
- def _end_itunes_owner(self): |
|
| 1013 |
- self.pop('publisher')
|
|
| 1014 |
- self.inpublisher = 0 |
|
| 1015 |
- self._sync_author_detail('publisher')
|
|
| 1016 |
- |
|
| 1017 |
- def _start_contributor(self, attrsD): |
|
| 1018 |
- self.incontributor = 1 |
|
| 1019 |
- context = self._getContext() |
|
| 1020 |
- context.setdefault('contributors', [])
|
|
| 1021 |
- context['contributors'].append(FeedParserDict()) |
|
| 1022 |
- self.push('contributor', 0)
|
|
| 1023 |
- |
|
| 1024 |
- def _end_contributor(self): |
|
| 1025 |
- self.pop('contributor')
|
|
| 1026 |
- self.incontributor = 0 |
|
| 1027 |
- |
|
| 1028 |
- def _start_dc_contributor(self, attrsD): |
|
| 1029 |
- self.incontributor = 1 |
|
| 1030 |
- context = self._getContext() |
|
| 1031 |
- context.setdefault('contributors', [])
|
|
| 1032 |
- context['contributors'].append(FeedParserDict()) |
|
| 1033 |
- self.push('name', 0)
|
|
| 1034 |
- |
|
| 1035 |
- def _end_dc_contributor(self): |
|
| 1036 |
- self._end_name() |
|
| 1037 |
- self.incontributor = 0 |
|
| 1038 |
- |
|
| 1039 |
- def _start_name(self, attrsD): |
|
| 1040 |
- self.push('name', 0)
|
|
| 1041 |
- _start_itunes_name = _start_name |
|
| 1042 |
- |
|
| 1043 |
- def _end_name(self): |
|
| 1044 |
- value = self.pop('name')
|
|
| 1045 |
- if self.inpublisher: |
|
| 1046 |
- self._save_author('name', value, 'publisher')
|
|
| 1047 |
- elif self.inauthor: |
|
| 1048 |
- self._save_author('name', value)
|
|
| 1049 |
- elif self.incontributor: |
|
-            self._save_contributor('name', value)
-        elif self.intextinput:
-            context = self._getContext()
-            context['name'] = value
-    _end_itunes_name = _end_name
-
-    def _start_width(self, attrsD):
-        self.push('width', 0)
-
-    def _end_width(self):
-        value = self.pop('width')
-        try:
-            value = int(value)
-        except:
-            value = 0
-        if self.inimage:
-            context = self._getContext()
-            context['width'] = value
-
-    def _start_height(self, attrsD):
-        self.push('height', 0)
-
-    def _end_height(self):
-        value = self.pop('height')
-        try:
-            value = int(value)
-        except:
-            value = 0
-        if self.inimage:
-            context = self._getContext()
-            context['height'] = value
-
-    def _start_url(self, attrsD):
-        self.push('href', 1)
-    _start_homepage = _start_url
-    _start_uri = _start_url
-
-    def _end_url(self):
-        value = self.pop('href')
-        if self.inauthor:
-            self._save_author('href', value)
-        elif self.incontributor:
-            self._save_contributor('href', value)
-    _end_homepage = _end_url
-    _end_uri = _end_url
-
-    def _start_email(self, attrsD):
-        self.push('email', 0)
-    _start_itunes_email = _start_email
-
-    def _end_email(self):
-        value = self.pop('email')
-        if self.inpublisher:
-            self._save_author('email', value, 'publisher')
-        elif self.inauthor:
-            self._save_author('email', value)
-        elif self.incontributor:
-            self._save_contributor('email', value)
-    _end_itunes_email = _end_email
-
-    def _getContext(self):
-        if self.insource:
-            context = self.sourcedata
-        elif self.inimage:
-            context = self.feeddata['image']
-        elif self.intextinput:
-            context = self.feeddata['textinput']
-        elif self.inentry:
-            context = self.entries[-1]
-        else:
-            context = self.feeddata
-        return context
-
-    def _save_author(self, key, value, prefix='author'):
-        context = self._getContext()
-        context.setdefault(prefix + '_detail', FeedParserDict())
-        context[prefix + '_detail'][key] = value
-        self._sync_author_detail()
-
-    def _save_contributor(self, key, value):
-        context = self._getContext()
-        context.setdefault('contributors', [FeedParserDict()])
-        context['contributors'][-1][key] = value
-
-    def _sync_author_detail(self, key='author'):
-        context = self._getContext()
-        detail = context.get('%s_detail' % key)
-        if detail:
-            name = detail.get('name')
-            email = detail.get('email')
-            if name and email:
-                context[key] = '%s (%s)' % (name, email)
-            elif name:
-                context[key] = name
-            elif email:
-                context[key] = email
-        else:
-            author, email = context.get(key), None
-            if not author: return
-            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
-            if emailmatch:
-                email = emailmatch.group(0)
-                # probably a better way to do the following, but it passes all the tests
-                author = author.replace(email, '')
-                author = author.replace('()', '')
-                author = author.replace('<>', '')
-                author = author.replace('&lt;&gt;', '')
-                author = author.strip()
-                if author and (author[0] == '('):
-                    author = author[1:]
-                if author and (author[-1] == ')'):
-                    author = author[:-1]
-                author = author.strip()
-            if author or email:
-                context.setdefault('%s_detail' % key, FeedParserDict())
-            if author:
-                context['%s_detail' % key]['name'] = author
-            if email:
-                context['%s_detail' % key]['email'] = email
-
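The deleted _sync_author_detail above recovers an email address embedded in a free-form author string and keeps the remainder as the name. A minimal standalone sketch of the same idea (simplified regex; split_author is a hypothetical helper, not feedparser API):

import re

# Simplified from the deleted _sync_author_detail: find an email address in
# an author string like 'Name (user@host)' and strip it out of the name.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')

def split_author(author):
    match = EMAIL_RE.search(author)
    if not match:
        return author.strip() or None, None
    email = match.group(0)
    # Drop the email and any now-empty brackets, as the original code does.
    name = author.replace(email, '').replace('()', '').replace('<>', '').strip()
    return name.strip('()').strip() or None, email

print(split_author('Mark Pilgrim (mark@example.com)'))
# -> ('Mark Pilgrim', 'mark@example.com')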
|
-    def _start_subtitle(self, attrsD):
-        self.pushContent('subtitle', attrsD, 'text/plain', 1)
-    _start_tagline = _start_subtitle
-    _start_itunes_subtitle = _start_subtitle
-
-    def _end_subtitle(self):
-        self.popContent('subtitle')
-    _end_tagline = _end_subtitle
-    _end_itunes_subtitle = _end_subtitle
-
-    def _start_rights(self, attrsD):
-        self.pushContent('rights', attrsD, 'text/plain', 1)
-    _start_dc_rights = _start_rights
-    _start_copyright = _start_rights
-
-    def _end_rights(self):
-        self.popContent('rights')
-    _end_dc_rights = _end_rights
-    _end_copyright = _end_rights
-
-    def _start_item(self, attrsD):
-        self.entries.append(FeedParserDict())
-        self.push('item', 0)
-        self.inentry = 1
-        self.guidislink = 0
-        self.hasTitle = 0
-        id = self._getAttribute(attrsD, 'rdf:about')
-        if id:
-            context = self._getContext()
-            context['id'] = id
-        self._cdf_common(attrsD)
-    _start_entry = _start_item
-    _start_product = _start_item
-
-    def _end_item(self):
-        self.pop('item')
-        self.inentry = 0
-    _end_entry = _end_item
-
-    def _start_dc_language(self, attrsD):
-        self.push('language', 1)
-    _start_language = _start_dc_language
-
-    def _end_dc_language(self):
-        self.lang = self.pop('language')
-    _end_language = _end_dc_language
-
-    def _start_dc_publisher(self, attrsD):
-        self.push('publisher', 1)
-    _start_webmaster = _start_dc_publisher
-
-    def _end_dc_publisher(self):
-        self.pop('publisher')
-        self._sync_author_detail('publisher')
-    _end_webmaster = _end_dc_publisher
-
-    def _start_published(self, attrsD):
-        self.push('published', 1)
-    _start_dcterms_issued = _start_published
-    _start_issued = _start_published
-
-    def _end_published(self):
-        value = self.pop('published')
-        self._save('published_parsed', _parse_date(value))
-    _end_dcterms_issued = _end_published
-    _end_issued = _end_published
-
-    def _start_updated(self, attrsD):
-        self.push('updated', 1)
-    _start_modified = _start_updated
-    _start_dcterms_modified = _start_updated
-    _start_pubdate = _start_updated
-    _start_dc_date = _start_updated
-
-    def _end_updated(self):
-        value = self.pop('updated')
-        parsed_value = _parse_date(value)
-        self._save('updated_parsed', parsed_value)
-    _end_modified = _end_updated
-    _end_dcterms_modified = _end_updated
-    _end_pubdate = _end_updated
-    _end_dc_date = _end_updated
-
-    def _start_created(self, attrsD):
-        self.push('created', 1)
-    _start_dcterms_created = _start_created
-
-    def _end_created(self):
-        value = self.pop('created')
-        self._save('created_parsed', _parse_date(value))
-    _end_dcterms_created = _end_created
-
-    def _start_expirationdate(self, attrsD):
-        self.push('expired', 1)
-
-    def _end_expirationdate(self):
-        self._save('expired_parsed', _parse_date(self.pop('expired')))
-
-    def _start_cc_license(self, attrsD):
-        context = self._getContext()
-        value = self._getAttribute(attrsD, 'rdf:resource')
-        attrsD = FeedParserDict()
-        attrsD['rel']='license'
-        if value: attrsD['href']=value
-        context.setdefault('links', []).append(attrsD)
-
-    def _start_creativecommons_license(self, attrsD):
-        self.push('license', 1)
-    _start_creativeCommons_license = _start_creativecommons_license
-
-    def _end_creativecommons_license(self):
-        value = self.pop('license')
-        context = self._getContext()
-        attrsD = FeedParserDict()
-        attrsD['rel']='license'
-        if value: attrsD['href']=value
-        context.setdefault('links', []).append(attrsD)
-        del context['license']
-    _end_creativeCommons_license = _end_creativecommons_license
-
-    def _addXFN(self, relationships, href, name):
-        context = self._getContext()
-        xfn = context.setdefault('xfn', [])
-        value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
-        if value not in xfn:
-            xfn.append(value)
-
-    def _addTag(self, term, scheme, label):
-        context = self._getContext()
-        tags = context.setdefault('tags', [])
-        if (not term) and (not scheme) and (not label): return
-        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
-        if value not in tags:
-            tags.append(value)
-
-    def _start_category(self, attrsD):
-        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
-        term = attrsD.get('term')
-        scheme = attrsD.get('scheme', attrsD.get('domain'))
-        label = attrsD.get('label')
-        self._addTag(term, scheme, label)
-        self.push('category', 1)
-    _start_dc_subject = _start_category
-    _start_keywords = _start_category
-
-    def _end_itunes_keywords(self):
-        for term in self.pop('itunes_keywords').split():
-            self._addTag(term, 'http://www.itunes.com/', None)
-
-    def _start_itunes_category(self, attrsD):
-        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
-        self.push('category', 1)
-
-    def _end_category(self):
-        value = self.pop('category')
-        if not value: return
-        context = self._getContext()
-        tags = context['tags']
-        if value and len(tags) and not tags[-1]['term']:
-            tags[-1]['term'] = value
-        else:
-            self._addTag(value, None, None)
-    _end_dc_subject = _end_category
-    _end_keywords = _end_category
-    _end_itunes_category = _end_category
-
-    def _start_cloud(self, attrsD):
-        self._getContext()['cloud'] = FeedParserDict(attrsD)
-
-    def _start_link(self, attrsD):
-        attrsD.setdefault('rel', 'alternate')
-        if attrsD['rel'] == 'self':
-            attrsD.setdefault('type', 'application/atom+xml')
-        else:
-            attrsD.setdefault('type', 'text/html')
-        context = self._getContext()
-        attrsD = self._itsAnHrefDamnIt(attrsD)
-        if attrsD.has_key('href'):
-            attrsD['href'] = self.resolveURI(attrsD['href'])
-            if attrsD.get('rel')=='enclosure' and not context.get('id'):
-                context['id'] = attrsD.get('href')
-        expectingText = self.infeed or self.inentry or self.insource
-        context.setdefault('links', [])
-        context['links'].append(FeedParserDict(attrsD))
-        if attrsD.has_key('href'):
-            expectingText = 0
-            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
-                context['link'] = attrsD['href']
-        else:
-            self.push('link', expectingText)
-    _start_producturl = _start_link
-
-    def _end_link(self):
-        value = self.pop('link')
-        context = self._getContext()
-    _end_producturl = _end_link
-
-    def _start_guid(self, attrsD):
-        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
-        self.push('id', 1)
-
-    def _end_guid(self):
-        value = self.pop('id')
-        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
-        if self.guidislink:
-            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
-            # and only if the item doesn't already have a link element
-            self._save('link', value)
-
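For reference, the guid handling just above encodes the RSS convention that a <guid> doubles as the entry link only when isPermaLink is absent or 'true' and no explicit <link> exists. The decision in isolation (hypothetical helper, not feedparser API):

def guid_as_link(entry, guid, is_permalink='true'):
    # Mirrors the deleted _end_guid: guid always becomes the id; it also
    # becomes the link only if isPermaLink != 'false' and no link was seen.
    entry['id'] = guid
    if is_permalink == 'true' and 'link' not in entry:
        entry['link'] = guid
    return entry

print(guid_as_link({}, 'http://example.com/post/1'))
print(guid_as_link({'link': 'http://example.com/'}, 'tag:example.com,2004:1', 'false'))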
|
-    def _start_title(self, attrsD):
-        if self.svgOK: return self.unknown_starttag('title', attrsD.items())
-        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
-    _start_dc_title = _start_title
-    _start_media_title = _start_title
-
-    def _end_title(self):
-        if self.svgOK: return
-        value = self.popContent('title')
-        if not value: return
-        context = self._getContext()
-        self.hasTitle = 1
-    _end_dc_title = _end_title
-
-    def _end_media_title(self):
-        hasTitle = self.hasTitle
-        self._end_title()
-        self.hasTitle = hasTitle
-
-    def _start_description(self, attrsD):
-        context = self._getContext()
-        if context.has_key('summary'):
-            self._summaryKey = 'content'
-            self._start_content(attrsD)
-        else:
-            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
-    _start_dc_description = _start_description
-
-    def _start_abstract(self, attrsD):
-        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
-
-    def _end_description(self):
-        if self._summaryKey == 'content':
-            self._end_content()
-        else:
-            value = self.popContent('description')
-        self._summaryKey = None
-    _end_abstract = _end_description
-    _end_dc_description = _end_description
-
-    def _start_info(self, attrsD):
-        self.pushContent('info', attrsD, 'text/plain', 1)
-    _start_feedburner_browserfriendly = _start_info
-
-    def _end_info(self):
-        self.popContent('info')
-    _end_feedburner_browserfriendly = _end_info
-
-    def _start_generator(self, attrsD):
-        if attrsD:
-            attrsD = self._itsAnHrefDamnIt(attrsD)
-            if attrsD.has_key('href'):
-                attrsD['href'] = self.resolveURI(attrsD['href'])
-        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
-        self.push('generator', 1)
-
-    def _end_generator(self):
-        value = self.pop('generator')
-        context = self._getContext()
-        if context.has_key('generator_detail'):
-            context['generator_detail']['name'] = value
-
-    def _start_admin_generatoragent(self, attrsD):
-        self.push('generator', 1)
-        value = self._getAttribute(attrsD, 'rdf:resource')
-        if value:
-            self.elementstack[-1][2].append(value)
-        self.pop('generator')
-        self._getContext()['generator_detail'] = FeedParserDict({'href': value})
-
-    def _start_admin_errorreportsto(self, attrsD):
-        self.push('errorreportsto', 1)
-        value = self._getAttribute(attrsD, 'rdf:resource')
-        if value:
-            self.elementstack[-1][2].append(value)
-        self.pop('errorreportsto')
-
-    def _start_summary(self, attrsD):
-        context = self._getContext()
-        if context.has_key('summary'):
-            self._summaryKey = 'content'
-            self._start_content(attrsD)
-        else:
-            self._summaryKey = 'summary'
-            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
-    _start_itunes_summary = _start_summary
-
-    def _end_summary(self):
-        if self._summaryKey == 'content':
-            self._end_content()
-        else:
-            self.popContent(self._summaryKey or 'summary')
-        self._summaryKey = None
-    _end_itunes_summary = _end_summary
-
-    def _start_enclosure(self, attrsD):
-        attrsD = self._itsAnHrefDamnIt(attrsD)
-        context = self._getContext()
-        attrsD['rel']='enclosure'
-        context.setdefault('links', []).append(FeedParserDict(attrsD))
-        href = attrsD.get('href')
-        if href and not context.get('id'):
-            context['id'] = href
-
-    def _start_source(self, attrsD):
-        self.insource = 1
-        self.hasTitle = 0
-
-    def _end_source(self):
-        self.insource = 0
-        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
-        self.sourcedata.clear()
-
-    def _start_content(self, attrsD):
-        self.pushContent('content', attrsD, 'text/plain', 1)
-        src = attrsD.get('src')
-        if src:
-            self.contentparams['src'] = src
-        self.push('content', 1)
-
-    def _start_prodlink(self, attrsD):
-        self.pushContent('content', attrsD, 'text/html', 1)
-
-    def _start_body(self, attrsD):
-        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
-    _start_xhtml_body = _start_body
-
-    def _start_content_encoded(self, attrsD):
-        self.pushContent('content', attrsD, 'text/html', 1)
-    _start_fullitem = _start_content_encoded
-
-    def _end_content(self):
-        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
-        value = self.popContent('content')
-        if copyToDescription:
-            self._save('description', value)
-
-    _end_body = _end_content
-    _end_xhtml_body = _end_content
-    _end_content_encoded = _end_content
-    _end_fullitem = _end_content
-    _end_prodlink = _end_content
-
-    def _start_itunes_image(self, attrsD):
-        self.push('itunes_image', 0)
-        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
-    _start_itunes_link = _start_itunes_image
-
-    def _end_itunes_block(self):
-        value = self.pop('itunes_block', 0)
-        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
-
-    def _end_itunes_explicit(self):
-        value = self.pop('itunes_explicit', 0)
-        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
-
-if _XML_AVAILABLE:
-    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
-        def __init__(self, baseuri, baselang, encoding):
-            if _debug: sys.stderr.write('trying StrictFeedParser\n')
-            xml.sax.handler.ContentHandler.__init__(self)
-            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
-            self.bozo = 0
-            self.exc = None
-
-        def startPrefixMapping(self, prefix, uri):
-            self.trackNamespace(prefix, uri)
-
-        def startElementNS(self, name, qname, attrs):
-            namespace, localname = name
-            lowernamespace = str(namespace or '').lower()
-            if lowernamespace.find('backend.userland.com/rss') <> -1:
-                # match any backend.userland.com namespace
-                namespace = 'http://backend.userland.com/rss'
-                lowernamespace = namespace
-            if qname and qname.find(':') > 0:
-                givenprefix = qname.split(':')[0]
-            else:
-                givenprefix = None
-            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
-            if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
-                raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
-            localname = str(localname).lower()
-
-            # qname implementation is horribly broken in Python 2.1 (it
-            # doesn't report any), and slightly broken in Python 2.2 (it
-            # doesn't report the xml: namespace). So we match up namespaces
-            # with a known list first, and then possibly override them with
-            # the qnames the SAX parser gives us (if indeed it gives us any
-            # at all). Thanks to MatejC for helping me test this and
-            # tirelessly telling me that it didn't work yet.
-            attrsD = {}
-            if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
-                attrsD['xmlns']=namespace
-            if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
-                attrsD['xmlns']=namespace
-
-            if prefix:
-                localname = prefix.lower() + ':' + localname
-            elif namespace and not qname: #Expat
-                for name,value in self.namespacesInUse.items():
-                    if name and value == namespace:
-                        localname = name + ':' + localname
-                        break
-            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
-
-            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
-                lowernamespace = (namespace or '').lower()
-                prefix = self._matchnamespaces.get(lowernamespace, '')
-                if prefix:
-                    attrlocalname = prefix + ':' + attrlocalname
-                attrsD[str(attrlocalname).lower()] = attrvalue
-            for qname in attrs.getQNames():
-                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
-            self.unknown_starttag(localname, attrsD.items())
-
-        def characters(self, text):
-            self.handle_data(text)
-
-        def endElementNS(self, name, qname):
-            namespace, localname = name
-            lowernamespace = str(namespace or '').lower()
-            if qname and qname.find(':') > 0:
-                givenprefix = qname.split(':')[0]
-            else:
-                givenprefix = ''
-            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
-            if prefix:
-                localname = prefix + ':' + localname
-            elif namespace and not qname: #Expat
-                for name,value in self.namespacesInUse.items():
-                    if name and value == namespace:
-                        localname = name + ':' + localname
-                        break
-            localname = str(localname).lower()
-            self.unknown_endtag(localname)
-
-        def error(self, exc):
-            self.bozo = 1
-            self.exc = exc
-
-        def fatalError(self, exc):
-            self.error(exc)
-            raise exc
-
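The strict parser above is a plain xml.sax ContentHandler; driving it takes the usual SAX setup with namespace processing switched on so that startElementNS/endElementNS fire. Roughly (a sketch of the standard wiring, not the exact driver code deleted elsewhere in this commit):

import io
import xml.sax

def parse_strict(data, handler):
    # handler: an xml.sax.handler.ContentHandler such as _StrictFeedParser;
    # data: the feed as bytes.
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(handler)
    parser.setErrorHandler(handler)
    source = xml.sax.xmlreader.InputSource()
    source.setByteStream(io.BytesIO(data))
    parser.parse(source)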
|
-class _BaseHTMLProcessor(sgmllib.SGMLParser):
-    special = re.compile('''[<>'"]''')
-    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-      'img', 'input', 'isindex', 'link', 'meta', 'param']
-
-    def __init__(self, encoding, type):
-        self.encoding = encoding
-        self.type = type
-        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
-        sgmllib.SGMLParser.__init__(self)
-
-    def reset(self):
-        self.pieces = []
-        sgmllib.SGMLParser.reset(self)
-
-    def _shorttag_replace(self, match):
-        tag = match.group(1)
-        if tag in self.elements_no_end_tag:
-            return '<' + tag + ' />'
-        else:
-            return '<' + tag + '></' + tag + '>'
-
-    def parse_starttag(self,i):
-        j=sgmllib.SGMLParser.parse_starttag(self, i)
-        if self.type == 'application/xhtml+xml':
-            if j>2 and self.rawdata[j-2:j]=='/>':
-                self.unknown_endtag(self.lasttag)
-        return j
-
-    def feed(self, data):
-        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
-        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
-        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
-        data = data.replace('&#39;', "'")
-        data = data.replace('&#34;', '"')
-        if self.encoding and type(data) == type(u''):
-            data = data.encode(self.encoding)
-        sgmllib.SGMLParser.feed(self, data)
-        sgmllib.SGMLParser.close(self)
-
-    def normalize_attrs(self, attrs):
-        if not attrs: return attrs
-        # utility method to be called by descendants
-        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
-        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
-        attrs.sort()
-        return attrs
-
-    def unknown_starttag(self, tag, attrs):
-        # called for each start tag
-        # attrs is a list of (attr, value) tuples
-        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
-        uattrs = []
-        strattrs=''
-        if attrs:
-            for key, value in attrs:
-                value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
-                value = self.bare_ampersand.sub("&amp;", value)
-                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-                if type(value) != type(u''):
-                    try:
-                        value = unicode(value, self.encoding)
-                    except:
-                        value = unicode(value, 'iso-8859-1')
-                uattrs.append((unicode(key, self.encoding), value))
-            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
-            if self.encoding:
-                try:
-                    strattrs=strattrs.encode(self.encoding)
-                except:
-                    pass
-        if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
-        else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
-
-    def unknown_endtag(self, tag):
-        # called for each end tag, e.g. for </pre>, tag will be 'pre'
-        # Reconstruct the original end tag.
-        if tag not in self.elements_no_end_tag:
-            self.pieces.append("</%(tag)s>" % locals())
-
-    def handle_charref(self, ref):
-        # called for each character reference, e.g. for '&#160;', ref will be '160'
-        # Reconstruct the original character reference.
-        if ref.startswith('x'):
-            value = unichr(int(ref[1:],16))
-        else:
-            value = unichr(int(ref))
-
-        if value in _cp1252.keys():
-            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
-        else:
-            self.pieces.append('&#%(ref)s;' % locals())
-
-    def handle_entityref(self, ref):
-        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
-        # Reconstruct the original entity reference.
-        if name2codepoint.has_key(ref):
-            self.pieces.append('&%(ref)s;' % locals())
-        else:
-            self.pieces.append('&amp;%(ref)s' % locals())
-
-    def handle_data(self, text):
-        # called for each block of plain text, i.e. outside of any tag and
-        # not containing any character or entity references
-        # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
-        self.pieces.append(text)
-
-    def handle_comment(self, text):
-        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
-        # Reconstruct the original comment.
-        self.pieces.append('<!--%(text)s-->' % locals())
-
-    def handle_pi(self, text):
-        # called for each processing instruction, e.g. <?instruction>
-        # Reconstruct original processing instruction.
-        self.pieces.append('<?%(text)s>' % locals())
-
-    def handle_decl(self, text):
-        # called for the DOCTYPE, if present, e.g.
-        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
-        #     "http://www.w3.org/TR/html4/loose.dtd">
-        # Reconstruct original DOCTYPE
-        self.pieces.append('<!%(text)s>' % locals())
-
-    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
-    def _scan_name(self, i, declstartpos):
-        rawdata = self.rawdata
-        n = len(rawdata)
-        if i == n:
-            return None, -1
-        m = self._new_declname_match(rawdata, i)
-        if m:
-            s = m.group()
-            name = s.strip()
-            if (i + len(s)) == n:
-                return None, -1  # end of buffer
-            return name.lower(), m.end()
-        else:
-            self.handle_data(rawdata)
-#            self.updatepos(declstartpos, i)
-            return None, -1
-
-    def convert_charref(self, name):
-        return '&#%s;' % name
-
-    def convert_entityref(self, name):
-        return '&%s;' % name
-
-    def output(self):
-        '''Return processed HTML as a single string'''
-        return ''.join([str(p) for p in self.pieces])
-
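_BaseHTMLProcessor's whole job is to tokenize markup and re-emit it from self.pieces so subclasses can intercept individual tokens. sgmllib is gone in Python 3; the same collect-and-reemit pattern looks roughly like this with html.parser (a rough sketch, not a drop-in replacement):

from html.parser import HTMLParser

class PiecesProcessor(HTMLParser):
    # Tokenize markup and reassemble it from pieces, like _BaseHTMLProcessor.
    def __init__(self):
        super().__init__(convert_charrefs=False)
        self.pieces = []
    def handle_starttag(self, tag, attrs):
        strattrs = ''.join(' %s="%s"' % (k, v) for k, v in attrs)
        self.pieces.append('<%s%s>' % (tag, strattrs))
    def handle_endtag(self, tag):
        self.pieces.append('</%s>' % tag)
    def handle_data(self, data):
        self.pieces.append(data)
    def output(self):
        return ''.join(self.pieces)

p = PiecesProcessor()
p.feed('<p class="x">hi</p>')
print(p.output())  # <p class="x">hi</p>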
|
-class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
-    def __init__(self, baseuri, baselang, encoding, entities):
-        sgmllib.SGMLParser.__init__(self)
-        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
-        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
-        self.entities=entities
-
-    def decodeEntities(self, element, data):
-        data = data.replace('&#60;', '&lt;')
-        data = data.replace('&#x3c;', '&lt;')
-        data = data.replace('&#x3C;', '&lt;')
-        data = data.replace('&#62;', '&gt;')
-        data = data.replace('&#x3e;', '&gt;')
-        data = data.replace('&#x3E;', '&gt;')
-        data = data.replace('&#38;', '&amp;')
-        data = data.replace('&#x26;', '&amp;')
-        data = data.replace('&#34;', '&quot;')
-        data = data.replace('&#x22;', '&quot;')
-        data = data.replace('&#39;', '&apos;')
-        data = data.replace('&#x27;', '&apos;')
-        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
-            data = data.replace('&lt;', '<')
-            data = data.replace('&gt;', '>')
-            data = data.replace('&amp;', '&')
-            data = data.replace('&quot;', '"')
-            data = data.replace('&apos;', "'")
-        return data
-
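The decodeEntities ladder above rewrites numeric character references for markup-significant characters into the equivalent named references, so '<', '>', '&' and quotes stay escaped while the SGML pass runs, and are only demoted to literal characters afterwards for non-XML content types. The first half of that trick in miniature (hypothetical helper):

# Promote numeric references to named ones so markup-significant characters
# survive a parsing pass, as the deleted decodeEntities does.
NUMERIC_TO_NAMED = {
    '&#60;': '&lt;', '&#62;': '&gt;', '&#38;': '&amp;',
    '&#34;': '&quot;', '&#39;': '&apos;',
}

def protect_entities(data):
    for numeric, named in NUMERIC_TO_NAMED.items():
        data = data.replace(numeric, named)
    return data

print(protect_entities('a &#60;b&#62; c'))  # a &lt;b&gt; c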
|
-    def strattrs(self, attrs):
-        return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
-
-class _MicroformatsParser:
-    STRING = 1
-    DATE = 2
-    URI = 3
-    NODE = 4
-    EMAIL = 5
-
-    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
-    known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']
-
-    def __init__(self, data, baseuri, encoding):
-        self.document = BeautifulSoup.BeautifulSoup(data)
-        self.baseuri = baseuri
-        self.encoding = encoding
-        if type(data) == type(u''):
-            data = data.encode(encoding)
-        self.tags = []
-        self.enclosures = []
-        self.xfn = []
-        self.vcard = None
-
-    def vcardEscape(self, s):
-        if type(s) in (type(''), type(u'')):
-            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
-        return s
-
-    def vcardFold(self, s):
-        s = re.sub(';+$', '', s)
-        sFolded = ''
-        iMax = 75
-        sPrefix = ''
-        while len(s) > iMax:
-            sFolded += sPrefix + s[:iMax] + '\n'
-            s = s[iMax:]
-            sPrefix = ' '
-            iMax = 74
-        sFolded += sPrefix + s
-        return sFolded
-
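vcardFold above is RFC 2426 line folding: a logical line longer than 75 characters is broken into physical lines, each continuation prefixed with a single space (so it carries 74 usable characters). The same logic standalone:

def vcard_fold(line, first=75, rest=74):
    # RFC 2426 folding, as in the deleted vcardFold: break long logical
    # lines; continuation lines start with one space.
    folded, prefix, limit = '', '', first
    while len(line) > limit:
        folded += prefix + line[:limit] + '\n'
        line = line[limit:]
        prefix, limit = ' ', rest
    return folded + prefix + line

print(vcard_fold('NOTE:' + 'x' * 160))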
|
-    def normalize(self, s):
-        return re.sub(r'\s+', ' ', s).strip()
-
-    def unique(self, aList):
-        results = []
-        for element in aList:
-            if element not in results:
-                results.append(element)
-        return results
-
-    def toISO8601(self, dt):
-        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
-
-    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
-        all = lambda x: 1
-        sProperty = sProperty.lower()
-        bFound = 0
-        bNormalize = 1
-        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
-        if bAllowMultiple and (iPropertyType != self.NODE):
-            snapResults = []
-            containers = elmRoot(['ul', 'ol'], propertyMatch)
-            for container in containers:
-                snapResults.extend(container('li'))
-            bFound = (len(snapResults) != 0)
-        if not bFound:
-            snapResults = elmRoot(all, propertyMatch)
-            bFound = (len(snapResults) != 0)
-        if (not bFound) and (sProperty == 'value'):
-            snapResults = elmRoot('pre')
-            bFound = (len(snapResults) != 0)
-            bNormalize = not bFound
-        if not bFound:
-            snapResults = [elmRoot]
-            bFound = (len(snapResults) != 0)
-        arFilter = []
-        if sProperty == 'vcard':
-            snapFilter = elmRoot(all, propertyMatch)
-            for node in snapFilter:
-                if node.findParent(all, propertyMatch):
-                    arFilter.append(node)
-        arResults = []
-        for node in snapResults:
-            if node not in arFilter:
-                arResults.append(node)
-        bFound = (len(arResults) != 0)
-        if not bFound:
-            if bAllowMultiple: return []
-            elif iPropertyType == self.STRING: return ''
-            elif iPropertyType == self.DATE: return None
-            elif iPropertyType == self.URI: return ''
-            elif iPropertyType == self.NODE: return None
-            else: return None
-        arValues = []
-        for elmResult in arResults:
-            sValue = None
-            if iPropertyType == self.NODE:
-                if bAllowMultiple:
-                    arValues.append(elmResult)
-                    continue
-                else:
-                    return elmResult
-            sNodeName = elmResult.name.lower()
-            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
-                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
-                if sValue:
-                    sValue = bNormalize and self.normalize(sValue) or sValue.strip()
-            if (not sValue) and (sNodeName == 'abbr'):
-                sValue = elmResult.get('title')
-                if sValue:
-                    sValue = bNormalize and self.normalize(sValue) or sValue.strip()
-            if (not sValue) and (iPropertyType == self.URI):
-                if sNodeName == 'a': sValue = elmResult.get('href')
-                elif sNodeName == 'img': sValue = elmResult.get('src')
-                elif sNodeName == 'object': sValue = elmResult.get('data')
-                if sValue:
-                    sValue = bNormalize and self.normalize(sValue) or sValue.strip()
-            if (not sValue) and (sNodeName == 'img'):
-                sValue = elmResult.get('alt')
-                if sValue:
-                    sValue = bNormalize and self.normalize(sValue) or sValue.strip()
-            if not sValue:
-                sValue = elmResult.renderContents()
-                sValue = re.sub(r'<\S[^>]*>', '', sValue)
-                sValue = sValue.replace('\r\n', '\n')
-                sValue = sValue.replace('\r', '\n')
-                if sValue:
-                    sValue = bNormalize and self.normalize(sValue) or sValue.strip()
-            if not sValue: continue
-            if iPropertyType == self.DATE:
-                sValue = _parse_date_iso8601(sValue)
-            if bAllowMultiple:
-                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
-            else:
-                return bAutoEscape and self.vcardEscape(sValue) or sValue
-        return arValues
-
-    def findVCards(self, elmRoot, bAgentParsing=0):
-        sVCards = ''
-
-        if not bAgentParsing:
-            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
-        else:
-            arCards = [elmRoot]
-
-        for elmCard in arCards:
-            arLines = []
-
-            def processSingleString(sProperty):
-                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1)
-                if sValue:
-                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
-                return sValue or ''
-
-            def processSingleURI(sProperty):
-                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
-                if sValue:
-                    sContentType = ''
-                    sEncoding = ''
-                    sValueKey = ''
-                    if sValue.startswith('data:'):
-                        sEncoding = ';ENCODING=b'
-                        sContentType = sValue.split(';')[0].split('/').pop()
-                        sValue = sValue.split(',', 1).pop()
-                    else:
-                        elmValue = self.getPropertyValue(elmCard, sProperty)
-                        if elmValue:
-                            if sProperty != 'url':
-                                sValueKey = ';VALUE=uri'
-                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
-                    sContentType = sContentType.upper()
-                    if sContentType == 'OCTET-STREAM':
-                        sContentType = ''
-                    if sContentType:
-                        sContentType = ';TYPE=' + sContentType.upper()
-                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
-
-            def processTypeValue(sProperty, arDefaultType, arForceType=None):
-                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
-                for elmResult in arResults:
-                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
-                    if arForceType:
-                        arType = self.unique(arForceType + arType)
-                    if not arType:
-                        arType = arDefaultType
-                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
-                    if sValue:
-                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
-
-            # AGENT
-            # must do this before all other properties because it is destructive
-            # (removes nested class="vcard" nodes so they don't interfere with
-            # this vcard's other properties)
-            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
-            for elmAgent in arAgent:
-                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
-                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
-                    sAgentValue = sAgentValue.replace('\n', '\\n')
-                    sAgentValue = sAgentValue.replace(';', '\\;')
-                    if sAgentValue:
-                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
-                    elmAgent['class'] = ''
-                    elmAgent.contents = []
-                else:
-                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
-                    if sAgentValue:
-                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
-
-            # FN (full name)
-            sFN = processSingleString('fn')
-
-            # N (name)
-            elmName = self.getPropertyValue(elmCard, 'n')
-            if elmName:
-                sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
-                sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
-                arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
-                arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
-                arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
-                arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
-                                              sGivenName + ';' +
-                                              ','.join(arAdditionalNames) + ';' +
-                                              ','.join(arHonorificPrefixes) + ';' +
-                                              ','.join(arHonorificSuffixes)))
-            elif sFN:
-                # implied "N" optimization
-                # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
-                arNames = self.normalize(sFN).split()
-                if len(arNames) == 2:
-                    bFamilyNameFirst = (arNames[0].endswith(',') or
-                                        len(arNames[1]) == 1 or
-                                        ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
-                    if bFamilyNameFirst:
-                        arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
-                    else:
-                        arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
-
-            # SORT-STRING
-            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
-            if sSortString:
-                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
-
-            # NICKNAME
-            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
-            if arNickname:
-                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
-
-            # PHOTO
-            processSingleURI('photo')
-
-            # BDAY
-            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
-            if dtBday:
-                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
-
-            # ADR (address)
-            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
-            for elmAdr in arAdr:
-                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
-                if not arType:
-                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
-                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
-                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
-                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
-                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
-                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
-                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
-                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
-                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
-                                              sPostOfficeBox + ';' +
-                                              sExtendedAddress + ';' +
-                                              sStreetAddress + ';' +
-                                              sLocality + ';' +
-                                              sRegion + ';' +
-                                              sPostalCode + ';' +
-                                              sCountryName))
-
-            # LABEL
-            processTypeValue('label', ['intl','postal','parcel','work'])
-
-            # TEL (phone number)
-            processTypeValue('tel', ['voice'])
-
-            # EMAIL
-            processTypeValue('email', ['internet'], ['internet'])
-
-            # MAILER
-            processSingleString('mailer')
-
-            # TZ (timezone)
-            processSingleString('tz')
-
-            # GEO (geographical information)
-            elmGeo = self.getPropertyValue(elmCard, 'geo')
-            if elmGeo:
-                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
-                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
-                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
-
-            # TITLE
-            processSingleString('title')
-
-            # ROLE
-            processSingleString('role')
-
-            # LOGO
-            processSingleURI('logo')
-
-            # ORG (organization)
-            elmOrg = self.getPropertyValue(elmCard, 'org')
-            if elmOrg:
-                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
-                if not sOrganizationName:
-                    # implied "organization-name" optimization
-                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
-                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
-                    if sOrganizationName:
-                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
-                else:
-                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
-                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
-
-            # CATEGORY
-            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
-            if arCategory:
-                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
-
-            # NOTE
-            processSingleString('note')
-
-            # REV
-            processSingleString('rev')
-
-            # SOUND
-            processSingleURI('sound')
-
-            # UID
-            processSingleString('uid')
-
-            # URL
-            processSingleURI('url')
-
-            # CLASS
-            processSingleString('class')
-
-            # KEY
-            processSingleURI('key')
-
-            if arLines:
-                arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard']
-                sVCards += '\n'.join(arLines) + '\n'
-
-        return sVCards.strip()
-
-    def isProbablyDownloadable(self, elm):
-        attrsD = elm.attrMap
-        if not attrsD.has_key('href'): return 0
-        linktype = attrsD.get('type', '').strip()
-        if linktype.startswith('audio/') or \
-           linktype.startswith('video/') or \
-           (linktype.startswith('application/') and not linktype.endswith('xml')):
-            return 1
-        path = urlparse.urlparse(attrsD['href'])[2]
-        if path.find('.') == -1: return 0
-        fileext = path.split('.').pop().lower()
-        return fileext in self.known_binary_extensions
-
-    def findTags(self):
-        all = lambda x: 1
-        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
-            href = elm.get('href')
-            if not href: continue
-            urlscheme, domain, path, params, query, fragment = \
-                urlparse.urlparse(_urljoin(self.baseuri, href))
-            segments = path.split('/')
-            tag = segments.pop()
-            if not tag:
-                tag = segments.pop()
-            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
-            if not tagscheme.endswith('/'):
-                tagscheme += '/'
-            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
-
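findTags above applies the rel="tag" microformat convention: the tag term is the last non-empty path segment of the linked URL, and everything up to it is recorded as the tag 'scheme'. The same split in isolation with urllib.parse (a sketch under that convention):

from urllib.parse import urljoin, urlparse, urlunparse

def split_rel_tag(base, href):
    # Last non-empty path segment is the term; the rest is the scheme.
    scheme, netloc, path, params, query, fragment = urlparse(urljoin(base, href))
    segments = path.split('/')
    term = segments.pop()
    if not term:
        term = segments.pop()
    tagscheme = urlunparse((scheme, netloc, '/'.join(segments), '', '', ''))
    if not tagscheme.endswith('/'):
        tagscheme += '/'
    return term, tagscheme

print(split_rel_tag('http://example.com/', '/tags/python/'))
# -> ('python', 'http://example.com/tags/')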
|
-    def findEnclosures(self):
-        all = lambda x: 1
-        enclosure_match = re.compile(r'\benclosure\b')
-        for elm in self.document(all, {'href': re.compile(r'.+')}):
-            if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
-            if elm.attrMap not in self.enclosures:
-                self.enclosures.append(elm.attrMap)
-                if elm.string and not elm.get('title'):
-                    self.enclosures[-1]['title'] = elm.string
-
-    def findXFN(self):
-        all = lambda x: 1
-        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
-            rels = elm.get('rel', '').split()
-            xfn_rels = []
-            for rel in rels:
-                if rel in self.known_xfn_relationships:
-                    xfn_rels.append(rel)
-            if xfn_rels:
-                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
-
-def _parseMicroformats(htmlSource, baseURI, encoding):
-    if not BeautifulSoup: return
-    if _debug: sys.stderr.write('entering _parseMicroformats\n')
-    p = _MicroformatsParser(htmlSource, baseURI, encoding)
-    p.vcard = p.findVCards(p.document)
-    p.findTags()
-    p.findEnclosures()
-    p.findXFN()
-    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
-
-class _RelativeURIResolver(_BaseHTMLProcessor):
-    relative_uris = [('a', 'href'),
-                     ('applet', 'codebase'),
-                     ('area', 'href'),
-                     ('blockquote', 'cite'),
-                     ('body', 'background'),
-                     ('del', 'cite'),
-                     ('form', 'action'),
-                     ('frame', 'longdesc'),
-                     ('frame', 'src'),
-                     ('iframe', 'longdesc'),
-                     ('iframe', 'src'),
-                     ('head', 'profile'),
-                     ('img', 'longdesc'),
-                     ('img', 'src'),
-                     ('img', 'usemap'),
-                     ('input', 'src'),
-                     ('input', 'usemap'),
-                     ('ins', 'cite'),
-                     ('link', 'href'),
-                     ('object', 'classid'),
-                     ('object', 'codebase'),
-                     ('object', 'data'),
-                     ('object', 'usemap'),
-                     ('q', 'cite'),
-                     ('script', 'src')]
-
-    def __init__(self, baseuri, encoding, type):
-        _BaseHTMLProcessor.__init__(self, encoding, type)
-        self.baseuri = baseuri
-
-    def resolveURI(self, uri):
-        return _urljoin(self.baseuri, uri.strip())
-
-    def unknown_starttag(self, tag, attrs):
-        attrs = self.normalize_attrs(attrs)
-        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
-        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-
-def _resolveRelativeURIs(htmlSource, baseURI, encoding, type):
-    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
-    p = _RelativeURIResolver(baseURI, encoding, type)
-    p.feed(htmlSource)
-    return p.output()
-
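The resolver above absolutizes only attributes known to carry URIs for a given tag, via the (tag, attribute) table and a urljoin-style lookup. The core of that lookup standalone (a sketch with a trimmed table):

from urllib.parse import urljoin

# Trimmed version of the (tag, attribute) table used by _RelativeURIResolver.
RELATIVE_URIS = {('a', 'href'), ('img', 'src'), ('link', 'href'), ('script', 'src')}

def resolve_attrs(tag, attrs, baseuri):
    # Absolutize only attributes known to hold URIs for this tag.
    return [(key, urljoin(baseuri, value.strip()) if (tag, key) in RELATIVE_URIS else value)
            for key, value in attrs]

print(resolve_attrs('img', [('src', 'pic.png'), ('alt', 'x')], 'http://example.com/feed/'))
# -> [('src', 'http://example.com/feed/pic.png'), ('alt', 'x')]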
|
| 2270 |
-class _HTMLSanitizer(_BaseHTMLProcessor): |
|
| 2271 |
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', |
|
| 2272 |
- 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', |
|
| 2273 |
- 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', |
|
| 2274 |
- 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', |
|
| 2275 |
- 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', |
|
| 2276 |
- 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', |
|
| 2277 |
- 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', |
|
| 2278 |
- 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', |
|
| 2279 |
- 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', |
|
| 2280 |
- 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', |
|
| 2281 |
- 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', |
|
| 2282 |
- 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] |
|
| 2283 |
- |
|
| 2284 |
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', |
|
| 2285 |
- 'action', 'align', 'alt', 'autoplay', 'autocomplete', 'autofocus', 'axis', |
|
| 2286 |
- 'background', 'balance', 'bgcolor', 'bgproperties', 'border', |
|
| 2287 |
- 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', |
|
| 2288 |
- 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', |
|
| 2289 |
- 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', |
|
| 2290 |
- 'colspan', 'compact', 'contenteditable', 'coords', 'data', 'datafld', |
|
| 2291 |
- 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', |
|
| 2292 |
- 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', |
|
| 2293 |
- 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', |
|
| 2294 |
- 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', |
|
| 2295 |
- 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', |
|
| 2296 |
- 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', |
|
| 2297 |
- 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', |
|
| 2298 |
- 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', |
|
| 2299 |
- 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', |
|
| 2300 |
- 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', |
|
| 2301 |
- 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', |
|
| 2302 |
- 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', |
|
| 2303 |
- 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', |
|
| 2304 |
- 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', |
|
| 2305 |
- 'xml:lang'] |
|
| 2306 |
- |
|
| 2307 |
- unacceptable_elements_with_end_tag = ['script', 'applet', 'style'] |
|
| 2308 |
- |
|
| 2309 |
- acceptable_css_properties = ['azimuth', 'background-color', |
|
| 2310 |
- 'border-bottom-color', 'border-collapse', 'border-color', |
|
| 2311 |
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear', |
|
| 2312 |
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', |
|
| 2313 |
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', |
|
| 2314 |
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', |
|
| 2315 |
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', |
|
| 2316 |
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', |
|
| 2317 |
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', |
|
| 2318 |
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', |
|
| 2319 |
- 'white-space', 'width'] |
|
| 2320 |
- |
|
| 2321 |
- # survey of common keywords found in feeds |
|
| 2322 |
- acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', |
|
| 2323 |
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', |
|
| 2324 |
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', |
|
| 2325 |
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', |
|
| 2326 |
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', |
|
| 2327 |
- 'transparent', 'underline', 'white', 'yellow'] |
|
| 2328 |
- |
|
| 2329 |
- valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + |

| 2330 |
- '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') |

| 2331 |
- |
|
| 2332 |
- mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math', |
|
| 2333 |
- 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', |
|
| 2334 |
- 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', |
|
| 2335 |
- 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', |
|
| 2336 |
- 'munderover', 'none', 'semantics'] |
|
| 2337 |
- |
|
| 2338 |
- mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', |
|
| 2339 |
- 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', |
|
| 2340 |
- 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', |
|
| 2341 |
- 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', |
|
| 2342 |
- 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', |
|
| 2343 |
- 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', |
|
| 2344 |
- 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', |
|
| 2345 |
- 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', |
|
| 2346 |
- 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'] |
|
| 2347 |
- |
|
| 2348 |
- # svgtiny - foreignObject + linearGradient + radialGradient + stop |
|
| 2349 |
- svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', |
|
| 2350 |
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', |
|
| 2351 |
- 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', |
|
| 2352 |
- 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', |
|
| 2353 |
- 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', |
|
| 2354 |
- 'svg', 'switch', 'text', 'title', 'tspan', 'use'] |
|
| 2355 |
- |
|
| 2356 |
- # svgtiny + class + opacity + offset + xmlns + xmlns:xlink |
|
| 2357 |
- svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', |
|
| 2358 |
- 'arabic-form', 'ascent', 'attributeName', 'attributeType', |
|
| 2359 |
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', |
|
| 2360 |
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', |
|
| 2361 |
- 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', |
|
| 2362 |
- 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', |
|
| 2363 |
- 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', |
|
| 2364 |
- 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', |
|
| 2365 |
- 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', |
|
| 2366 |
- 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', |
|
| 2367 |
- 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', |
|
| 2368 |
- 'min', 'name', 'offset', 'opacity', 'orient', 'origin', |
|
| 2369 |
- 'overline-position', 'overline-thickness', 'panose-1', 'path', |
|
| 2370 |
- 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', |
|
| 2371 |
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', |
|
| 2372 |
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', |
|
| 2373 |
- 'stop-color', 'stop-opacity', 'strikethrough-position', |
|
| 2374 |
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray', |
|
| 2375 |
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', |
|
| 2376 |
- 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', |
|
| 2377 |
- 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', |
|
| 2378 |
- 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', |
|
| 2379 |
- 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', |
|
| 2380 |
- 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', |
|
| 2381 |
- 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', |
|
| 2382 |
- 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', |
|
| 2383 |
- 'y2', 'zoomAndPan'] |
|
| 2384 |
- |
|
| 2385 |
- svg_attr_map = None |
|
| 2386 |
- svg_elem_map = None |
|
| 2387 |
- |
|
| 2388 |
- acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', |
|
| 2389 |
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', |
|
| 2390 |
- 'stroke-opacity'] |
|
| 2391 |
- |
|
| 2392 |
- def reset(self): |
|
| 2393 |
- _BaseHTMLProcessor.reset(self) |
|
| 2394 |
- self.unacceptablestack = 0 |
|
| 2395 |
- self.mathmlOK = 0 |
|
| 2396 |
- self.svgOK = 0 |
|
| 2397 |
- |
|
| 2398 |
- def unknown_starttag(self, tag, attrs): |
|
| 2399 |
- acceptable_attributes = self.acceptable_attributes |
|
| 2400 |
- keymap = {} |

| 2401 |
- if not tag in self.acceptable_elements or self.svgOK: |
|
| 2402 |
- if tag in self.unacceptable_elements_with_end_tag: |
|
| 2403 |
- self.unacceptablestack += 1 |
|
| 2404 |
- |
|
| 2405 |
- # not otherwise acceptable, perhaps it is MathML or SVG? |
|
| 2406 |
- if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: |

| 2407 |
- self.mathmlOK += 1 |
|
| 2408 |
- if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs: |

| 2409 |
- self.svgOK += 1 |
|
| 2410 |
- |
|
| 2411 |
- # choose acceptable attributes based on tag class, else bail |
|
| 2412 |
- if self.mathmlOK and tag in self.mathml_elements: |
|
| 2413 |
- acceptable_attributes = self.mathml_attributes |
|
| 2414 |
- elif self.svgOK and tag in self.svg_elements: |
|
| 2415 |
- # for most vocabularies, lowercasing is a good idea. Many |
|
| 2416 |
- # svg elements, however, are camel case |
|
| 2417 |
- if not self.svg_attr_map: |
|
| 2418 |
- lower=[attr.lower() for attr in self.svg_attributes] |
|
| 2419 |
- mix=[a for a in self.svg_attributes if a not in lower] |
|
| 2420 |
- self.svg_attributes = lower |
|
| 2421 |
- self.svg_attr_map = dict([(a.lower(),a) for a in mix]) |
|
| 2422 |
- |
|
| 2423 |
- lower=[attr.lower() for attr in self.svg_elements] |
|
| 2424 |
- mix=[a for a in self.svg_elements if a not in lower] |
|
| 2425 |
- self.svg_elements = lower |
|
| 2426 |
- self.svg_elem_map = dict([(a.lower(),a) for a in mix]) |
|
| 2427 |
- acceptable_attributes = self.svg_attributes |
|
| 2428 |
- tag = self.svg_elem_map.get(tag,tag) |
|
| 2429 |
- keymap = self.svg_attr_map |
|
| 2430 |
- elif not tag in self.acceptable_elements: |
|
| 2431 |
- return |
|
| 2432 |
- |
|
| 2433 |
- # declare xlink namespace, if needed |
|
| 2434 |
- if self.mathmlOK or self.svgOK: |
|
| 2435 |
- if filter(lambda (n,v): n.startswith('xlink:'),attrs): |

| 2436 |
- if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: |

| 2437 |
- attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) |

| 2438 |
- |
|
| 2439 |
- clean_attrs = [] |
|
| 2440 |
- for key, value in self.normalize_attrs(attrs): |
|
| 2441 |
- if key in acceptable_attributes: |
|
| 2442 |
- key=keymap.get(key,key) |
|
| 2443 |
- clean_attrs.append((key,value)) |
|
| 2444 |
- elif key=='style': |
|
| 2445 |
- clean_value = self.sanitize_style(value) |
|
| 2446 |
- if clean_value: clean_attrs.append((key,clean_value)) |
|
| 2447 |
- _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) |
|
| 2448 |
- |
|
| 2449 |
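The starttag handler above restores camelCase SVG names after the underlying HTML parser has lowercased them, by lazily building a lowercase-to-canonical map. A minimal sketch of that trick, with a trimmed, illustrative element list rather than feedparser's full table:

    # HTML parsers lowercase tag names, but SVG is case-sensitive, so
    # 'lineargradient' has to be mapped back to 'linearGradient'.
    svg_elements = ['a', 'circle', 'foreignObject', 'linearGradient',
                    'radialGradient', 'animateTransform']
    mixed = [name for name in svg_elements if name != name.lower()]
    svg_elem_map = {name.lower(): name for name in mixed}

    def restore_case(tag):
        return svg_elem_map.get(tag, tag)  # unknown tags pass through

    assert restore_case('lineargradient') == 'linearGradient'
    assert restore_case('circle') == 'circle'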
- def unknown_endtag(self, tag): |
|
| 2450 |
- if not tag in self.acceptable_elements: |
|
| 2451 |
- if tag in self.unacceptable_elements_with_end_tag: |
|
| 2452 |
- self.unacceptablestack -= 1 |
|
| 2453 |
- if self.mathmlOK and tag in self.mathml_elements: |
|
| 2454 |
- if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1 |
|
| 2455 |
- elif self.svgOK and tag in self.svg_elements: |
|
| 2456 |
- tag = self.svg_elem_map.get(tag,tag) |
|
| 2457 |
- if tag == 'svg' and self.svgOK: self.svgOK -= 1 |
|
| 2458 |
- else: |
|
| 2459 |
- return |
|
| 2460 |
- _BaseHTMLProcessor.unknown_endtag(self, tag) |
|
| 2461 |
- |
|
| 2462 |
- def handle_pi(self, text): |
|
| 2463 |
- pass |
|
| 2464 |
- |
|
| 2465 |
- def handle_decl(self, text): |
|
| 2466 |
- pass |
|
| 2467 |
- |
|
| 2468 |
- def handle_data(self, text): |
|
| 2469 |
- if not self.unacceptablestack: |
|
| 2470 |
- _BaseHTMLProcessor.handle_data(self, text) |
|
| 2471 |
- |
|
| 2472 |
- def sanitize_style(self, style): |
|
| 2473 |
- # disallow urls |
|
| 2474 |
- style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) |

| 2475 |
- |
|
| 2476 |
- # gauntlet |
|
| 2477 |
- if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' |

| 2478 |
- if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' |

| 2479 |
- |
|
| 2480 |
- clean = [] |
|
| 2481 |
- for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): |

| 2482 |
- if not value: continue |
|
| 2483 |
- if prop.lower() in self.acceptable_css_properties: |
|
| 2484 |
- clean.append(prop + ': ' + value + ';') |
|
| 2485 |
- elif prop.split('-')[0].lower() in ['background','border','margin','padding']: |

| 2486 |
- for keyword in value.split(): |
|
| 2487 |
- if not keyword in self.acceptable_css_keywords and \ |
|
| 2488 |
- not self.valid_css_values.match(keyword): |
|
| 2489 |
- break |
|
| 2490 |
- else: |
|
| 2491 |
- clean.append(prop + ': ' + value + ';') |
|
| 2492 |
- elif self.svgOK and prop.lower() in self.acceptable_svg_properties: |
|
| 2493 |
- clean.append(prop + ': ' + value + ';') |
|
| 2494 |
- |
|
| 2495 |
- return ' '.join(clean) |
|
| 2496 |
- |
|
| 2497 |
- |
|
| 2498 |
-def _sanitizeHTML(htmlSource, encoding, type): |
|
| 2499 |
- p = _HTMLSanitizer(encoding, type) |
|
| 2500 |
- p.feed(htmlSource) |
|
| 2501 |
- data = p.output() |
|
| 2502 |
- if TIDY_MARKUP: |
|
| 2503 |
- # loop through list of preferred Tidy interfaces looking for one that's installed, |
|
| 2504 |
- # then set up a common _tidy function to wrap the interface-specific API. |
|
| 2505 |
- _tidy = None |
|
| 2506 |
- for tidy_interface in PREFERRED_TIDY_INTERFACES: |
|
| 2507 |
- try: |
|
| 2508 |
- if tidy_interface == "uTidy": |
|
| 2509 |
- from tidy import parseString as _utidy |
|
| 2510 |
- def _tidy(data, **kwargs): |
|
| 2511 |
- return str(_utidy(data, **kwargs)) |
|
| 2512 |
- break |
|
| 2513 |
- elif tidy_interface == "mxTidy": |
|
| 2514 |
- from mx.Tidy import Tidy as _mxtidy |
|
| 2515 |
- def _tidy(data, **kwargs): |
|
| 2516 |
- nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) |
|
| 2517 |
- return data |
|
| 2518 |
- break |
|
| 2519 |
- except: |
|
| 2520 |
- pass |
|
| 2521 |
- if _tidy: |
|
| 2522 |
- utf8 = type(data) == type(u'') |
|
| 2523 |
- if utf8: |
|
| 2524 |
- data = data.encode('utf-8') |

| 2525 |
- data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") |
|
| 2526 |
- if utf8: |
|
| 2527 |
- data = unicode(data, 'utf-8') |
|
| 2528 |
- if data.count('<body'): |

| 2529 |
- data = data.split('<body', 1)[1] |

| 2530 |
- if data.count('>'): |

| 2531 |
- data = data.split('>', 1)[1] |

| 2532 |
- if data.count('</body'): |

| 2533 |
- data = data.split('</body', 1)[0] |

| 2534 |
- data = data.strip().replace('\r\n', '\n') |

| 2535 |
- return data |
|
| 2536 |
- |
|
| 2537 |
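When a Tidy binding is available, _sanitizeHTML above re-serializes the cleaned markup and then keeps only what sits inside the body element. A standalone sketch of just that extraction step:

    def extract_body(data):
        # keep only the markup between <body ...> and </body>, then
        # normalize Windows line endings, as the cleanup above does
        if '<body' in data:
            data = data.split('<body', 1)[1]
            if '>' in data:
                data = data.split('>', 1)[1]
        if '</body' in data:
            data = data.split('</body', 1)[0]
        return data.strip().replace('\r\n', '\n')

    print(extract_body('<html><body class="x"><p>hi</p></body></html>'))  # <p>hi</p>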
-class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): |
|
| 2538 |
- def http_error_default(self, req, fp, code, msg, headers): |
|
| 2539 |
- if ((code / 100) == 3) and (code != 304): |
|
| 2540 |
- return self.http_error_302(req, fp, code, msg, headers) |
|
| 2541 |
- infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
| 2542 |
- infourl.status = code |
|
| 2543 |
- return infourl |
|
| 2544 |
- |
|
| 2545 |
- def http_error_302(self, req, fp, code, msg, headers): |
|
| 2546 |
- if headers.dict.has_key('location'): |

| 2546 |
- infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) |
|
| 2548 |
- else: |
|
| 2549 |
- infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
| 2550 |
- if not hasattr(infourl, 'status'): |
|
| 2551 |
- infourl.status = code |
|
| 2552 |
- return infourl |
|
| 2553 |
- |
|
| 2554 |
- def http_error_301(self, req, fp, code, msg, headers): |
|
| 2555 |
- if headers.dict.has_key('location'): |

| 2556 |
- infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) |
|
| 2557 |
- else: |
|
| 2558 |
- infourl = urllib.addinfourl(fp, headers, req.get_full_url()) |
|
| 2559 |
- if not hasattr(infourl, 'status'): |
|
| 2560 |
- infourl.status = code |
|
| 2561 |
- return infourl |
|
| 2562 |
- |
|
| 2563 |
- http_error_300 = http_error_302 |
|
| 2564 |
- http_error_303 = http_error_302 |
|
| 2565 |
- http_error_307 = http_error_302 |
|
| 2566 |
- |
|
| 2567 |
- def http_error_401(self, req, fp, code, msg, headers): |
|
| 2568 |
- # Check if |
|
| 2569 |
- # - server requires digest auth, AND |
|
| 2570 |
- # - we tried (unsuccessfully) with basic auth, AND |
|
| 2571 |
- # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions) |
|
| 2572 |
- # If all conditions hold, parse authentication information |
|
| 2573 |
- # out of the Authorization header we sent the first time |
|
| 2574 |
- # (for the username and password) and the WWW-Authenticate |
|
| 2575 |
- # header the server sent back (for the realm) and retry |
|
| 2576 |
- # the request with the appropriate digest auth headers instead. |
|
| 2577 |
- # This evil genius hack has been brought to you by Aaron Swartz. |
|
| 2578 |
- host = urlparse.urlparse(req.get_full_url())[1] |
|
| 2579 |
- try: |
|
| 2580 |
- assert sys.version.split()[0] >= '2.3.3' |
|
| 2581 |
- assert base64 != None |
|
| 2582 |
- user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') |

| 2583 |
- realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] |

| 2584 |
- self.add_password(realm, host, user, passw) |
|
| 2585 |
- retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) |

| 2586 |
- self.reset_retry_count() |
|
| 2587 |
- return retry |
|
| 2588 |
- except: |
|
| 2589 |
- return self.http_error_default(req, fp, code, msg, headers) |
|
| 2590 |
- |
|
| 2591 |
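The 401 handler above retries a failed basic-auth request with digest credentials recovered from the first attempt. A rough modern equivalent of that fallback, sketched with the third-party requests library (an assumption for illustration; this module itself uses urllib2):

    import requests
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    def fetch_with_auth_fallback(url, user, password):
        # try basic auth first; on a 401 carrying a Digest challenge,
        # retry the same request with digest credentials
        resp = requests.get(url, auth=HTTPBasicAuth(user, password))
        challenge = resp.headers.get('WWW-Authenticate', '')
        if resp.status_code == 401 and challenge.lower().startswith('digest'):
            resp = requests.get(url, auth=HTTPDigestAuth(user, password))
        return resp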
-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): |
|
| 2592 |
- """URL, filename, or string --> stream |
|
| 2593 |
- |
|
| 2594 |
- This function lets you define parsers that take any input source |
|
| 2595 |
- (URL, pathname to local or network file, or actual data as a string) |
|
| 2596 |
- and deal with it in a uniform manner. Returned object is guaranteed |
|
| 2597 |
- to have all the basic stdio read methods (read, readline, readlines). |
|
| 2598 |
- Just .close() the object when you're done with it. |
|
| 2599 |
- |
|
| 2600 |
- If the etag argument is supplied, it will be used as the value of an |
|
| 2601 |
- If-None-Match request header. |
|
| 2602 |
- |
|
| 2603 |
- If the modified argument is supplied, it can be a tuple of 9 integers |
|
| 2604 |
- (as returned by gmtime() in the standard Python time module) or a date |
|
| 2605 |
- string in any format supported by feedparser. Regardless, it MUST |
|
| 2606 |
- be in GMT (Greenwich Mean Time). It will be reformatted into an |
|
| 2607 |
- RFC 1123-compliant date and used as the value of an If-Modified-Since |
|
| 2608 |
- request header. |
|
| 2609 |
- |
|
| 2610 |
- If the agent argument is supplied, it will be used as the value of a |
|
| 2611 |
- User-Agent request header. |
|
| 2612 |
- |
|
| 2613 |
- If the referrer argument is supplied, it will be used as the value of a |
|
| 2614 |
- Referer[sic] request header. |
|
| 2615 |
- |
|
| 2616 |
- If handlers is supplied, it is a list of handlers used to build a |
|
| 2617 |
- urllib2 opener. |
|
| 2618 |
- """ |
|
| 2619 |
- |
|
| 2620 |
- if hasattr(url_file_stream_or_string, 'read'): |
|
| 2621 |
- return url_file_stream_or_string |
|
| 2622 |
- |
|
| 2623 |
- if url_file_stream_or_string == '-': |
|
| 2624 |
- return sys.stdin |
|
| 2625 |
- |
|
| 2626 |
- if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): |

| 2627 |
- if not agent: |
|
| 2628 |
- agent = USER_AGENT |
|
| 2629 |
- # test for inline user:password for basic auth |
|
| 2630 |
- auth = None |
|
| 2631 |
- if base64: |
|
| 2632 |
- urltype, rest = urllib.splittype(url_file_stream_or_string) |
|
| 2633 |
- realhost, rest = urllib.splithost(rest) |
|
| 2634 |
- if realhost: |
|
| 2635 |
- user_passwd, realhost = urllib.splituser(realhost) |
|
| 2636 |
- if user_passwd: |
|
| 2637 |
- url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) |
|
| 2638 |
- auth = base64.encodestring(user_passwd).strip() |
|
| 2639 |
- |
|
| 2640 |
- # iri support |
|
| 2641 |
- try: |
|
| 2642 |
- if isinstance(url_file_stream_or_string,unicode): |
|
| 2643 |
- url_file_stream_or_string = url_file_stream_or_string.encode('idna') |

| 2644 |
- else: |
|
| 2645 |
- url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna') |

| 2646 |
- except: |
|
| 2647 |
- pass |
|
| 2648 |
- |
|
| 2649 |
- # try to open with urllib2 (to use optional headers) |
|
| 2650 |
- request = urllib2.Request(url_file_stream_or_string) |
|
| 2651 |
- request.add_header('User-Agent', agent) |

| 2652 |
- if etag: |
|
| 2653 |
- request.add_header('If-None-Match', etag) |

| 2654 |
- if type(modified) == type(''): |

| 2655 |
- modified = _parse_date(modified) |
|
| 2656 |
- if modified: |
|
| 2657 |
- # format into an RFC 1123-compliant timestamp. We can't use |
|
| 2658 |
- # time.strftime() since the %a and %b directives can be affected |
|
| 2659 |
- # by the current locale, but RFC 2616 states that dates must be |
|
| 2660 |
- # in English. |
|
| 2661 |
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] |
|
| 2662 |
- months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
| 2663 |
- request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) |

| 2664 |
- if referrer: |
|
| 2665 |
- request.add_header('Referer', referrer) |

| 2666 |
- if gzip and zlib: |
|
| 2667 |
- request.add_header('Accept-encoding', 'gzip, deflate') |

| 2668 |
- elif gzip: |
|
| 2669 |
- request.add_header('Accept-encoding', 'gzip') |

| 2670 |
- elif zlib: |
|
| 2671 |
- request.add_header('Accept-encoding', 'deflate') |

| 2672 |
- else: |
|
| 2673 |
- request.add_header('Accept-encoding', '') |

| 2674 |
- if auth: |
|
| 2675 |
- request.add_header('Authorization', 'Basic %s' % auth) |

| 2676 |
- if ACCEPT_HEADER: |
|
| 2677 |
- request.add_header('Accept', ACCEPT_HEADER) |

| 2678 |
- request.add_header('A-IM', 'feed') # RFC 3229 support |

| 2679 |
- opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) |
|
| 2680 |
- opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent |
|
| 2681 |
- try: |
|
| 2682 |
- return opener.open(request) |
|
| 2683 |
- finally: |
|
| 2684 |
- opener.close() # JohnD |
|
| 2685 |
- |
|
| 2686 |
- # try to open with native open function (if url_file_stream_or_string is a filename) |
|
| 2687 |
- try: |
|
| 2688 |
- return open(url_file_stream_or_string) |
|
| 2689 |
- except: |
|
| 2690 |
- pass |
|
| 2691 |
- |
|
| 2692 |
- # treat url_file_stream_or_string as string |
|
| 2693 |
- return _StringIO(str(url_file_stream_or_string)) |
|
| 2694 |
- |
|
| 2695 |
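The etag and modified plumbing above implements HTTP conditional GET: send the validators from the previous fetch, and a 304 answer means the feed is unchanged and carries no body. A minimal Python 3 sketch of the same handshake using only the standard library:

    import urllib.error
    import urllib.request

    def conditional_get(url, etag=None, last_modified=None):
        req = urllib.request.Request(url)
        if etag:
            req.add_header('If-None-Match', etag)
        if last_modified:
            # must already be an RFC 1123 date string, in GMT
            req.add_header('If-Modified-Since', last_modified)
        try:
            with urllib.request.urlopen(req) as resp:
                return resp.status, resp.read(), resp.headers.get('ETag')
        except urllib.error.HTTPError as e:
            if e.code == 304:
                return 304, b'', etag   # unchanged; nothing to parse
            raise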
-_date_handlers = [] |
|
| 2696 |
-def registerDateHandler(func): |
|
| 2697 |
- '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' |
|
| 2698 |
- _date_handlers.insert(0, func) |
|
| 2699 |
- |
|
| 2700 |
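Because registerDateHandler prepends, the most recently registered handler wins, so callers can teach the parser new formats without touching the built-ins. Any callable that maps a string to a 9-tuple in GMT qualifies; time.struct_time works. For instance, for a hypothetical compact format:

    import time

    def parse_compact_date(date_string):
        # illustrative handler for a made-up 'YYYYMMDD-HHMMSS' format
        try:
            return time.strptime(date_string, '%Y%m%d-%H%M%S')
        except ValueError:
            return None   # fall through to the next handler

    registerDateHandler(parse_compact_date)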
-# ISO-8601 date parsing routines written by Fazal Majid. |
|
| 2701 |
-# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 |
|
| 2702 |
-# parser is beyond the scope of feedparser and would be a worthwhile addition |
|
| 2703 |
-# to the Python library. |
|
| 2704 |
-# A single regular expression cannot parse ISO 8601 date formats into groups |
|
| 2705 |
-# as the standard is highly irregular (for instance is 030104 2003-01-04 or |
|
| 2706 |
-# 0301-04-01), so we use templates instead. |
|
| 2707 |
-# Please note the order in templates is significant because we need a |
|
| 2708 |
-# greedy match. |
|
| 2709 |
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', |
|
| 2710 |
- 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', |
|
| 2711 |
- '-YY-?MM', '-OOO', '-YY', |
|
| 2712 |
- '--MM-?DD', '--MM', |
|
| 2713 |
- '---DD', |
|
| 2714 |
- 'CC', ''] |
|
| 2715 |
-_iso8601_re = [ |
|
| 2716 |
- tmpl.replace( |
|
| 2717 |
- 'YYYY', r'(?P<year>\d{4})').replace( |

| 2718 |
- 'YY', r'(?P<year>\d\d)').replace( |
|
| 2719 |
- 'MM', r'(?P<month>[01]\d)').replace( |
|
| 2720 |
- 'DD', r'(?P<day>[0123]\d)').replace( |
|
| 2721 |
- 'OOO', r'(?P<ordinal>[0123]\d\d)').replace( |
|
| 2722 |
- 'CC', r'(?P<century>\d\d$)') |
|
| 2723 |
- + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})' |

| 2724 |
- + r'(:(?P<second>\d{2}(\.\d*)?))?' |

| 2725 |
- + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' |

| 2726 |
- for tmpl in _iso8601_tmpl] |
|
| 2727 |
-del tmpl |
|
| 2728 |
-_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] |
|
| 2729 |
-del regex |
|
| 2730 |
-def _parse_date_iso8601(dateString): |
|
| 2731 |
- '''Parse a variety of ISO-8601-compatible formats like 20040105''' |
|
| 2732 |
- m = None |
|
| 2733 |
- for _iso8601_match in _iso8601_matches: |
|
| 2734 |
- m = _iso8601_match(dateString) |
|
| 2735 |
- if m: break |
|
| 2736 |
- if not m: return |
|
| 2737 |
- if m.span() == (0, 0): return |
|
| 2738 |
- params = m.groupdict() |
|
| 2739 |
- ordinal = params.get('ordinal', 0) |

| 2740 |
- if ordinal: |
|
| 2741 |
- ordinal = int(ordinal) |
|
| 2742 |
- else: |
|
| 2743 |
- ordinal = 0 |
|
| 2744 |
- year = params.get('year', '--') |

| 2745 |
- if not year or year == '--': |
|
| 2746 |
- year = time.gmtime()[0] |
|
| 2747 |
- elif len(year) == 2: |
|
| 2748 |
- # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 |
|
| 2749 |
- year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
| 2750 |
- else: |
|
| 2751 |
- year = int(year) |
|
| 2752 |
- month = params.get('month', '-') |

| 2753 |
- if not month or month == '-': |
|
| 2754 |
- # ordinals are NOT normalized by mktime, we simulate them |
|
| 2755 |
- # by setting month=1, day=ordinal |
|
| 2756 |
- if ordinal: |
|
| 2757 |
- month = 1 |
|
| 2758 |
- else: |
|
| 2759 |
- month = time.gmtime()[1] |
|
| 2760 |
- month = int(month) |
|
| 2761 |
- day = params.get('day', 0) |

| 2762 |
- if not day: |
|
| 2763 |
- # see above |
|
| 2764 |
- if ordinal: |
|
| 2765 |
- day = ordinal |
|
| 2766 |
- elif params.get('century', 0) or \ |

| 2767 |
- params.get('year', 0) or params.get('month', 0): |

| 2768 |
- day = 1 |
|
| 2769 |
- else: |
|
| 2770 |
- day = time.gmtime()[2] |
|
| 2771 |
- else: |
|
| 2772 |
- day = int(day) |
|
| 2773 |
- # special case of the century - is the first year of the 21st century |
|
| 2774 |
- # 2000 or 2001 ? The debate goes on... |
|
| 2775 |
- if 'century' in params.keys(): |
|
| 2776 |
- year = (int(params['century']) - 1) * 100 + 1 |
|
| 2777 |
- # in ISO 8601 most fields are optional |
|
| 2778 |
- for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: |
|
| 2779 |
- if not params.get(field, None): |
|
| 2780 |
- params[field] = 0 |
|
| 2781 |
- hour = int(params.get('hour', 0)) |

| 2782 |
- minute = int(params.get('minute', 0)) |

| 2783 |
- second = int(float(params.get('second', 0))) |

| 2784 |
- # weekday is normalized by mktime(), we can ignore it |
|
| 2785 |
- weekday = 0 |
|
| 2786 |
- daylight_savings_flag = -1 |
|
| 2787 |
- tm = [year, month, day, hour, minute, second, weekday, |
|
| 2788 |
- ordinal, daylight_savings_flag] |
|
| 2789 |
- # ISO 8601 time zone adjustments |
|
| 2790 |
- tz = params.get('tz') |

| 2791 |
- if tz and tz != 'Z': |
|
| 2792 |
- if tz[0] == '-': |
|
| 2793 |
- tm[3] += int(params.get('tzhour', 0)) |

| 2794 |
- tm[4] += int(params.get('tzmin', 0)) |

| 2795 |
- elif tz[0] == '+': |
|
| 2796 |
- tm[3] -= int(params.get('tzhour', 0)) |

| 2797 |
- tm[4] -= int(params.get('tzmin', 0)) |

| 2798 |
- else: |
|
| 2799 |
- return None |
|
| 2800 |
- # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) |
|
| 2801 |
- # which is guaranteed to normalize d/m/y/h/m/s. |
|
| 2802 |
- # Many implementations have bugs, but we'll pretend they don't. |
|
| 2803 |
- return time.localtime(time.mktime(tm)) |
|
| 2804 |
-registerDateHandler(_parse_date_iso8601) |
|
| 2805 |
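The templates above become named-group regexes through the chained .replace() calls, which is why the greedy ordering of the template list matters. A small sketch of one expansion:

    import re

    template = 'YYYY-?MM-?DD'
    pattern = (template
               .replace('YYYY', r'(?P<year>\d{4})')
               .replace('MM', r'(?P<month>[01]\d)')
               .replace('DD', r'(?P<day>[0123]\d)'))
    m = re.match(pattern, '2004-01-05')
    print(m.groupdict())   # {'year': '2004', 'month': '01', 'day': '05'}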
- |
|
| 2806 |
-# 8-bit date handling routines written by ytrewq1. |
|
| 2807 |
-_korean_year = u'\ub144' # b3e2 in euc-kr |
|
| 2808 |
-_korean_month = u'\uc6d4' # bff9 in euc-kr |
|
| 2809 |
-_korean_day = u'\uc77c' # c0cf in euc-kr |
|
| 2810 |
-_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr |
|
| 2811 |
-_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr |
|
| 2812 |
- |
|
| 2813 |
-_korean_onblog_date_re = \ |
|
| 2814 |
- re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ |

| 2815 |
- (_korean_year, _korean_month, _korean_day)) |
|
| 2816 |
-_korean_nate_date_re = \ |
|
| 2817 |
- re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ |

| 2818 |
- (_korean_am, _korean_pm)) |
|
| 2819 |
-def _parse_date_onblog(dateString): |
|
| 2820 |
- '''Parse a string according to the OnBlog 8-bit date format''' |
|
| 2821 |
- m = _korean_onblog_date_re.match(dateString) |
|
| 2822 |
- if not m: return |
|
| 2823 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
| 2824 |
- {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |

| 2825 |
- 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
| 2826 |
- 'zonediff': '+09:00'} |
|
| 2827 |
- if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) |

| 2828 |
- return _parse_date_w3dtf(w3dtfdate) |
|
| 2829 |
-registerDateHandler(_parse_date_onblog) |
|
| 2830 |
- |
|
| 2831 |
-def _parse_date_nate(dateString): |
|
| 2832 |
- '''Parse a string according to the Nate 8-bit date format''' |
|
| 2833 |
- m = _korean_nate_date_re.match(dateString) |
|
| 2834 |
- if not m: return |
|
| 2835 |
- hour = int(m.group(5)) |
|
| 2836 |
- ampm = m.group(4) |
|
| 2837 |
- if (ampm == _korean_pm): |
|
| 2838 |
- hour += 12 |
|
| 2839 |
- hour = str(hour) |
|
| 2840 |
- if len(hour) == 1: |
|
| 2841 |
- hour = '0' + hour |
|
| 2842 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
| 2843 |
- {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |

| 2844 |
- 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ |
|
| 2845 |
- 'zonediff': '+09:00'} |
|
| 2846 |
- if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) |

| 2847 |
- return _parse_date_w3dtf(w3dtfdate) |
|
| 2848 |
-registerDateHandler(_parse_date_nate) |
|
| 2849 |
- |
|
| 2850 |
-_mssql_date_re = \ |
|
| 2851 |
- re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') |

| 2852 |
-def _parse_date_mssql(dateString): |
|
| 2853 |
- '''Parse a string according to the MS SQL date format''' |
|
| 2854 |
- m = _mssql_date_re.match(dateString) |
|
| 2855 |
- if not m: return |
|
| 2856 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|
| 2857 |
- {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |

| 2858 |
- 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|
| 2859 |
- 'zonediff': '+09:00'} |
|
| 2860 |
- if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) |

| 2861 |
- return _parse_date_w3dtf(w3dtfdate) |
|
| 2862 |
-registerDateHandler(_parse_date_mssql) |
|
| 2863 |
- |
|
| 2864 |
-# Unicode strings for Greek date strings |
|
| 2865 |
-_greek_months = \ |
|
| 2866 |
- { \ |

| 2867 |
- u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 |
|
| 2868 |
- u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 |
|
| 2869 |
- u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 |
|
| 2870 |
- u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 |
|
| 2871 |
- u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 |
|
| 2872 |
- u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 |
|
| 2873 |
- u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 |
|
| 2874 |
- u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 |
|
| 2875 |
- u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 |
|
| 2876 |
- u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 |
|
| 2877 |
- u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 |
|
| 2878 |
- u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 |
|
| 2879 |
- u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 |
|
| 2880 |
- u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 |
|
| 2881 |
- u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 |
|
| 2882 |
- u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 |
|
| 2883 |
- u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 |
|
| 2884 |
- u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 |
|
| 2885 |
- u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 |
|
| 2886 |
- } |
|
| 2887 |
- |
|
| 2888 |
-_greek_wdays = \ |
|
| 2889 |
- { \ |

| 2890 |
- u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 |
|
| 2891 |
- u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 |
|
| 2892 |
- u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 |
|
| 2893 |
- u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 |
|
| 2894 |
- u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 |
|
| 2895 |
- u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 |
|
| 2896 |
- u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 |
|
| 2897 |
- } |
|
| 2898 |
- |
|
| 2899 |
-_greek_date_format_re = \ |
|
| 2900 |
- re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') |

| 2901 |
- |
|
| 2902 |
-def _parse_date_greek(dateString): |
|
| 2903 |
- '''Parse a string according to a Greek 8-bit date format.''' |
|
| 2904 |
- m = _greek_date_format_re.match(dateString) |
|
| 2905 |
- if not m: return |
|
| 2906 |
- try: |
|
| 2907 |
- wday = _greek_wdays[m.group(1)] |
|
| 2908 |
- month = _greek_months[m.group(3)] |
|
| 2909 |
- except: |
|
| 2910 |
- return |
|
| 2911 |
- rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ |
|
| 2912 |
- {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ |

| 2912 |
- 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ |
|
| 2914 |
- 'zonediff': m.group(8)} |
|
| 2915 |
- if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) |

| 2916 |
- return _parse_date_rfc822(rfc822date) |
|
| 2917 |
-registerDateHandler(_parse_date_greek) |
|
| 2918 |
- |
|
| 2919 |
-# Unicode strings for Hungarian date strings |
|
| 2920 |
-_hungarian_months = \ |
|
| 2921 |
- { \ |

| 2922 |
- u'janu\u00e1r': u'01', # e1 in iso-8859-2 |
|
| 2923 |
- u'febru\u00e1ri': u'02', # e1 in iso-8859-2 |
|
| 2924 |
- u'm\u00e1rcius': u'03', # e1 in iso-8859-2 |
|
| 2925 |
- u'\u00e1prilis': u'04', # e1 in iso-8859-2 |
|
| 2926 |
- u'm\u00e1ujus': u'05', # e1 in iso-8859-2 |
|
| 2927 |
- u'j\u00fanius': u'06', # fa in iso-8859-2 |
|
| 2928 |
- u'j\u00falius': u'07', # fa in iso-8859-2 |
|
| 2929 |
- u'augusztus': u'08', |
|
| 2930 |
- u'szeptember': u'09', |
|
| 2931 |
- u'okt\u00f3ber': u'10', # f3 in iso-8859-2 |
|
| 2932 |
- u'november': u'11', |
|
| 2933 |
- u'december': u'12', |
|
| 2934 |
- } |
|
| 2935 |
- |
|
| 2936 |
-_hungarian_date_format_re = \ |
|
| 2937 |
- re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') |

| 2938 |
- |
|
| 2939 |
-def _parse_date_hungarian(dateString): |
|
| 2940 |
- '''Parse a string according to a Hungarian 8-bit date format.''' |
|
| 2941 |
- m = _hungarian_date_format_re.match(dateString) |
|
| 2942 |
- if not m: return |
|
| 2943 |
- try: |
|
| 2944 |
- month = _hungarian_months[m.group(2)] |
|
| 2945 |
- day = m.group(3) |
|
| 2946 |
- if len(day) == 1: |
|
| 2947 |
- day = '0' + day |
|
| 2948 |
- hour = m.group(4) |
|
| 2949 |
- if len(hour) == 1: |
|
| 2950 |
- hour = '0' + hour |
|
| 2951 |
- except: |
|
| 2952 |
- return |
|
| 2953 |
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ |
|
| 2954 |
- {'year': m.group(1), 'month': month, 'day': day,\ |

| 2955 |
- 'hour': hour, 'minute': m.group(5),\ |
|
| 2956 |
- 'zonediff': m.group(6)} |
|
| 2957 |
- if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) |

| 2958 |
- return _parse_date_w3dtf(w3dtfdate) |
|
| 2959 |
-registerDateHandler(_parse_date_hungarian) |
|
| 2960 |
- |
|
| 2961 |
-# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by |
|
| 2962 |
-# Drake and licensed under the Python license. Removed all range checking |
|
| 2963 |
-# for month, day, hour, minute, and second, since mktime will normalize |
|
| 2964 |
-# these later |
|
| 2965 |
-def _parse_date_w3dtf(dateString): |
|
| 2966 |
- def __extract_date(m): |
|
| 2967 |
- year = int(m.group('year')) |

| 2968 |
- if year < 100: |
|
| 2969 |
- year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|
| 2970 |
- if year < 1000: |
|
| 2971 |
- return 0, 0, 0 |
|
| 2972 |
- julian = m.group('julian') |

| 2973 |
- if julian: |
|
| 2974 |
- julian = int(julian) |
|
| 2975 |
- month = julian / 30 + 1 |
|
| 2976 |
- day = julian % 30 + 1 |
|
| 2977 |
- jday = None |
|
| 2978 |
- while jday != julian: |
|
| 2979 |
- t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) |
|
| 2980 |
- jday = time.gmtime(t)[-2] |
|
| 2981 |
- diff = abs(jday - julian) |
|
| 2982 |
- if jday > julian: |
|
| 2983 |
- if diff < day: |
|
| 2984 |
- day = day - diff |
|
| 2985 |
- else: |
|
| 2986 |
- month = month - 1 |
|
| 2987 |
- day = 31 |
|
| 2988 |
- elif jday < julian: |
|
| 2989 |
- if day + diff < 28: |
|
| 2990 |
- day = day + diff |
|
| 2991 |
- else: |
|
| 2992 |
- month = month + 1 |
|
| 2993 |
- return year, month, day |
|
| 2994 |
- month = m.group('month') |

| 2995 |
- day = 1 |
|
| 2996 |
- if month is None: |
|
| 2997 |
- month = 1 |
|
| 2998 |
- else: |
|
| 2999 |
- month = int(month) |
|
| 3000 |
- day = m.group('day') |

| 3001 |
- if day: |
|
| 3002 |
- day = int(day) |
|
| 3003 |
- else: |
|
| 3004 |
- day = 1 |
|
| 3005 |
- return year, month, day |
|
| 3006 |
- |
|
| 3007 |
- def __extract_time(m): |
|
| 3008 |
- if not m: |
|
| 3009 |
- return 0, 0, 0 |
|
| 3010 |
- hours = m.group('hours') |

| 3011 |
- if not hours: |
|
| 3012 |
- return 0, 0, 0 |
|
| 3013 |
- hours = int(hours) |
|
| 3014 |
- minutes = int(m.group('minutes')) |

| 3015 |
- seconds = m.group('seconds') |

| 3016 |
- if seconds: |
|
| 3017 |
- seconds = int(seconds) |
|
| 3018 |
- else: |
|
| 3019 |
- seconds = 0 |
|
| 3020 |
- return hours, minutes, seconds |
|
| 3021 |
- |
|
| 3022 |
- def __extract_tzd(m): |
|
| 3023 |
- '''Return the Time Zone Designator as an offset in seconds from UTC.''' |
|
| 3024 |
- if not m: |
|
| 3025 |
- return 0 |
|
| 3026 |
- tzd = m.group('tzd') |

| 3027 |
- if not tzd: |
|
| 3028 |
- return 0 |
|
| 3029 |
- if tzd == 'Z': |
|
| 3030 |
- return 0 |
|
| 3031 |
- hours = int(m.group('tzdhours')) |

| 3032 |
- minutes = m.group('tzdminutes') |

| 3033 |
- if minutes: |
|
| 3034 |
- minutes = int(minutes) |
|
| 3035 |
- else: |
|
| 3036 |
- minutes = 0 |
|
| 3037 |
- offset = (hours*60 + minutes) * 60 |
|
| 3038 |
- if tzd[0] == '+': |
|
| 3039 |
- return -offset |
|
| 3040 |
- return offset |
|
| 3041 |
- |
|
| 3042 |
- __date_re = ('(?P<year>\d\d\d\d)' |

| 3043 |
- '(?:(?P<dsep>-|)' |
|
| 3044 |
- '(?:(?P<julian>\d\d\d)' |
|
| 3045 |
- '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?') |
|
| 3046 |
- __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)' |
|
| 3047 |
- __tzd_rx = re.compile(__tzd_re) |
|
| 3048 |
- __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)' |

| 3049 |
- '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?' |
|
| 3050 |
- + __tzd_re) |
|
| 3051 |
- __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) |
|
| 3052 |
- __datetime_rx = re.compile(__datetime_re) |
|
| 3053 |
- m = __datetime_rx.match(dateString) |
|
| 3054 |
- if (m is None) or (m.group() != dateString): return |
|
| 3055 |
- gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) |
|
| 3056 |
- if gmt[0] == 0: return |
|
| 3057 |
- return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) |
|
| 3058 |
-registerDateHandler(_parse_date_w3dtf) |
|
| 3059 |
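One subtlety in __extract_tzd above is the sign convention: the designator comes back as the number of seconds to add to reach UTC, so a zone ahead of UTC yields a negative offset. A standalone restatement of that logic:

    def tzd_to_seconds(tzd):
        # 'Z' is UTC; '+HH:MM' zones are ahead of UTC, so their offset is
        # subtracted when normalizing to GMT, mirroring __extract_tzd
        if not tzd or tzd == 'Z':
            return 0
        sign = -1 if tzd[0] == '+' else 1
        hours, _, minutes = tzd[1:].partition(':')
        return sign * (int(hours) * 3600 + int(minutes or 0) * 60)

    assert tzd_to_seconds('+05:30') == -19800
    assert tzd_to_seconds('-08') == 28800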
- |
|
| 3060 |
-def _parse_date_rfc822(dateString): |
|
| 3061 |
- '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' |
|
| 3062 |
- data = dateString.split() |
|
| 3063 |
- if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: |

| 3064 |
- del data[0] |
|
| 3065 |
- if len(data) == 4: |
|
| 3066 |
- s = data[3] |
|
| 3067 |
- i = s.find('+') |

| 3068 |
- if i > 0: |
|
| 3069 |
- data[3:] = [s[:i], s[i+1:]] |
|
| 3070 |
- else: |
|
| 3071 |
- data.append('') |

| 3072 |
- dateString = " ".join(data) |
|
| 3073 |
- if len(data) < 5: |
|
| 3074 |
- dateString += ' 00:00:00 GMT' |
|
| 3075 |
- tm = rfc822.parsedate_tz(dateString) |
|
| 3076 |
- if tm: |
|
| 3077 |
- return time.gmtime(rfc822.mktime_tz(tm)) |
|
| 3078 |
-# rfc822.py defines several time zones, but we define some extra ones. |
|
| 3079 |
-# 'ET' is equivalent to 'EST', etc. |
|
| 3080 |
-_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} |

| 3081 |
-rfc822._timezones.update(_additional_timezones) |
|
| 3082 |
-registerDateHandler(_parse_date_rfc822) |
|
| 3083 |
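The rfc822 module leaned on above exists only in Python 2; in Python 3 the same parsedate_tz/mktime_tz pair lives in email.utils, so a port of this handler could read:

    import time
    from email.utils import mktime_tz, parsedate_tz

    def parse_rfc822_py3(date_string):
        # Python 3 stand-in for the rfc822-module calls above
        tm = parsedate_tz(date_string)
        if tm:
            return time.gmtime(mktime_tz(tm))

    print(parse_rfc822_py3('Fri, 15 Sep 2006 08:19:53 GMT')[:6])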
- |
|
| 3084 |
-def _parse_date_perforce(aDateString): |
|
| 3085 |
- """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" |
|
| 3086 |
- # Fri, 2006/09/15 08:19:53 EDT |
|
| 3087 |
- _my_date_pattern = re.compile( \ |
|
| 3088 |
- r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') |

| 3089 |
- |
|
| 3090 |
- dow, year, month, day, hour, minute, second, tz = \ |
|
| 3091 |
- _my_date_pattern.search(aDateString).groups() |
|
| 3092 |
- months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|
| 3093 |
- dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) |
|
| 3094 |
- tm = rfc822.parsedate_tz(dateString) |
|
| 3095 |
- if tm: |
|
| 3096 |
- return time.gmtime(rfc822.mktime_tz(tm)) |
|
| 3097 |
-registerDateHandler(_parse_date_perforce) |
|
| 3098 |
- |
|
| 3099 |
-def _parse_date(dateString): |
|
| 3100 |
- '''Parses a variety of date formats into a 9-tuple in GMT''' |
|
| 3101 |
- for handler in _date_handlers: |
|
| 3102 |
- try: |
|
| 3103 |
- date9tuple = handler(dateString) |
|
| 3104 |
- if not date9tuple: continue |
|
| 3105 |
- if len(date9tuple) != 9: |
|
| 3106 |
- if _debug: sys.stderr.write('date handler function must return 9-tuple\n') |

| 3107 |
- raise ValueError |
|
| 3108 |
- map(int, date9tuple) |
|
| 3109 |
- return date9tuple |
|
| 3110 |
- except Exception as e: |
|
| 3111 |
- if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) |

| 3112 |
- pass |
|
| 3113 |
- return None |
|
| 3114 |
- |
|
| 3115 |
-def _getCharacterEncoding(http_headers, xml_data): |
|
| 3116 |
- '''Get the character encoding of the XML document |
|
| 3117 |
- |
|
| 3118 |
- http_headers is a dictionary |
|
| 3119 |
- xml_data is a raw string (not Unicode) |
|
| 3120 |
- |
|
| 3121 |
- This is so much trickier than it sounds, it's not even funny. |
|
| 3122 |
- According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type |

| 3123 |
- is application/xml, application/*+xml, |
|
| 3124 |
- application/xml-external-parsed-entity, or application/xml-dtd, |
|
| 3125 |
- the encoding given in the charset parameter of the HTTP Content-Type |
|
| 3126 |
- takes precedence over the encoding given in the XML prefix within the |
|
| 3127 |
- document, and defaults to 'utf-8' if neither are specified. But, if |
|
| 3128 |
- the HTTP Content-Type is text/xml, text/*+xml, or |
|
| 3129 |
- text/xml-external-parsed-entity, the encoding given in the XML prefix |
|
| 3130 |
- within the document is ALWAYS IGNORED and only the encoding given in |
|
| 3131 |
- the charset parameter of the HTTP Content-Type header should be |
|
| 3132 |
- respected, and it defaults to 'us-ascii' if not specified. |
|
| 3133 |
- |
|
| 3134 |
- Furthermore, discussion on the atom-syntax mailing list with the |
|
| 3135 |
- author of RFC 3023 leads me to the conclusion that any document |
|
| 3136 |
- served with a Content-Type of text/* and no charset parameter |
|
| 3137 |
- must be treated as us-ascii. (We now do this.) And also that it |
|
| 3138 |
- must always be flagged as non-well-formed. (We now do this too.) |
|
| 3139 |
- |
|
| 3140 |
- If Content-Type is unspecified (input was local file or non-HTTP source) |
|
| 3141 |
- or unrecognized (server just got it totally wrong), then go by the |
|
| 3142 |
- encoding given in the XML prefix of the document and default to |
|
| 3143 |
- 'iso-8859-1' as per the HTTP specification (RFC 2616). |
|
| 3144 |
- |
|
| 3145 |
- Then, assuming we didn't find a character encoding in the HTTP headers |
|
| 3146 |
- (and the HTTP Content-type allowed us to look in the body), we need |
|
| 3147 |
- to sniff the first few bytes of the XML data and try to determine |
|
| 3148 |
- whether the encoding is ASCII-compatible. Section F of the XML |
|
| 3149 |
- specification shows the way here: |
|
| 3150 |
- http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
| 3151 |
- |
|
| 3152 |
- If the sniffed encoding is not ASCII-compatible, we need to make it |
|
| 3153 |
- ASCII compatible so that we can sniff further into the XML declaration |
|
| 3154 |
- to find the encoding attribute, which will tell us the true encoding. |
|
| 3155 |
- |
|
| 3156 |
- Of course, none of this guarantees that we will be able to parse the |
|
| 3157 |
- feed in the declared character encoding (assuming it was declared |
|
| 3158 |
- correctly, which many are not). CJKCodecs and iconv_codec help a lot; |
|
| 3159 |
- you should definitely install them if you can. |
|
| 3160 |
- http://cjkpython.i18n.org/ |
|
| 3161 |
- ''' |
|
| 3162 |
- |
|
| 3163 |
- def _parseHTTPContentType(content_type): |
|
| 3164 |
- '''takes HTTP Content-Type header and returns (content type, charset) |
|
| 3165 |
- |
|
| 3166 |
- If no charset is specified, returns (content type, '') |
|
| 3167 |
- If no content type is specified, returns ('', '') |

| 3168 |
- Both return parameters are guaranteed to be lowercase strings |
|
| 3169 |
- ''' |
|
| 3170 |
- content_type = content_type or '' |
|
| 3171 |
- content_type, params = cgi.parse_header(content_type) |
|
| 3172 |
- return content_type, params.get('charset', '').replace("'", '') |

| 3173 |
- |
|
| 3174 |
- sniffed_xml_encoding = '' |
|
| 3175 |
- xml_encoding = '' |
|
| 3176 |
- true_encoding = '' |
|
| 3177 |
- http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) |

| 3178 |
- # Must sniff for non-ASCII-compatible character encodings before |
|
| 3179 |
- # searching for XML declaration. This heuristic is defined in |
|
| 3180 |
- # section F of the XML specification: |
|
| 3181 |
- # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info |
|
| 3182 |
- try: |
|
| 3183 |
- if xml_data[:4] == '\x4c\x6f\xa7\x94': |
|
| 3184 |
- # EBCDIC |
|
| 3185 |
- xml_data = _ebcdic_to_ascii(xml_data) |
|
| 3186 |
- elif xml_data[:4] == '\x00\x3c\x00\x3f': |
|
| 3187 |
- # UTF-16BE |
|
| 3188 |
- sniffed_xml_encoding = 'utf-16be' |
|
| 3189 |
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') |

| 3190 |
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): |
|
| 3191 |
- # UTF-16BE with BOM |
|
| 3192 |
- sniffed_xml_encoding = 'utf-16be' |
|
| 3193 |
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') |

| 3194 |
- elif xml_data[:4] == '\x3c\x00\x3f\x00': |
|
| 3195 |
- # UTF-16LE |
|
| 3196 |
- sniffed_xml_encoding = 'utf-16le' |
|
| 3197 |
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') |

| 3198 |
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): |
|
| 3199 |
- # UTF-16LE with BOM |
|
| 3200 |
- sniffed_xml_encoding = 'utf-16le' |
|
| 3201 |
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') |

| 3202 |
- elif xml_data[:4] == '\x00\x00\x00\x3c': |
|
| 3203 |
- # UTF-32BE |
|
| 3204 |
- sniffed_xml_encoding = 'utf-32be' |
|
| 3205 |
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') |

| 3206 |
- elif xml_data[:4] == '\x3c\x00\x00\x00': |
|
| 3207 |
- # UTF-32LE |
|
| 3208 |
- sniffed_xml_encoding = 'utf-32le' |
|
| 3209 |
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') |

| 3210 |
- elif xml_data[:4] == '\x00\x00\xfe\xff': |
|
| 3211 |
- # UTF-32BE with BOM |
|
| 3212 |
- sniffed_xml_encoding = 'utf-32be' |
|
| 3213 |
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') |

| 3214 |
- elif xml_data[:4] == '\xff\xfe\x00\x00': |
|
| 3215 |
- # UTF-32LE with BOM |
|
| 3216 |
- sniffed_xml_encoding = 'utf-32le' |
|
| 3217 |
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') |

| 3218 |
- elif xml_data[:3] == '\xef\xbb\xbf': |
|
| 3219 |
- # UTF-8 with BOM |
|
| 3220 |
- sniffed_xml_encoding = 'utf-8' |
|
| 3221 |
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') |

| 3222 |
- else: |
|
| 3223 |
- # ASCII-compatible |
|
| 3224 |
- pass |
|
| 3225 |
- xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) |

| 3226 |
- except: |
|
| 3227 |
- xml_encoding_match = None |
|
| 3228 |
- if xml_encoding_match: |
|
| 3229 |
- xml_encoding = xml_encoding_match.groups()[0].lower() |
|
| 3230 |
- if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): |

| 3231 |
- xml_encoding = sniffed_xml_encoding |
|
| 3232 |
- acceptable_content_type = 0 |
|
| 3233 |
- application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') |

| 3234 |
- text_content_types = ('text/xml', 'text/xml-external-parsed-entity') |

| 3235 |
- if (http_content_type in application_content_types) or \ |
|
| 3236 |
- (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): |

| 3237 |
- acceptable_content_type = 1 |
|
| 3238 |
- true_encoding = http_encoding or xml_encoding or 'utf-8' |
|
| 3239 |
- elif (http_content_type in text_content_types) or \ |
|
| 3240 |
- (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): |

| 3241 |
- acceptable_content_type = 1 |
|
| 3242 |
- true_encoding = http_encoding or 'us-ascii' |
|
| 3243 |
- elif http_content_type.startswith('text/'): |

| 3244 |
- true_encoding = http_encoding or 'us-ascii' |
|
| 3245 |
- elif http_headers and (not http_headers.has_key('content-type')): |

| 3246 |
- true_encoding = xml_encoding or 'iso-8859-1' |
|
| 3247 |
- else: |
|
| 3248 |
- true_encoding = xml_encoding or 'utf-8' |
|
| 3249 |
- # some feeds claim to be gb2312 but are actually gb18030. |
|
| 3250 |
- # apparently MSIE and Firefox both do the following switch: |
|
| 3251 |
- if true_encoding.lower() == 'gb2312': |
|
| 3252 |
- true_encoding = 'gb18030' |
|
| 3253 |
- return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type |
|
| 3254 |
- |
|
| 3255 |
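The docstring above boils down to a small precedence table. A condensed sketch of just that decision, with trimmed media-type checks (the real function also sniffs BOMs and flags unacceptable content types):

    def choose_encoding(http_content_type, http_charset, xml_encoding):
        app_xml = (http_content_type == 'application/xml' or
                   (http_content_type.startswith('application/') and
                    http_content_type.endswith('+xml')))
        text_xml = (http_content_type == 'text/xml' or
                    (http_content_type.startswith('text/') and
                     http_content_type.endswith('+xml')))
        if app_xml:
            # HTTP charset beats the in-document declaration
            return http_charset or xml_encoding or 'utf-8'
        if text_xml or http_content_type.startswith('text/'):
            # the in-document declaration is ignored entirely
            return http_charset or 'us-ascii'
        if not http_content_type:
            # no header at all: trust the document, default per RFC 2616
            return xml_encoding or 'iso-8859-1'
        return xml_encoding or 'utf-8'

    print(choose_encoding('text/xml', '', 'utf-16'))   # us-ascii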
-def _toUTF8(data, encoding): |
|
| 3256 |
- '''Changes an XML data stream on the fly to specify a new encoding |
|
| 3257 |
- |
|
| 3258 |
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already |
|
| 3259 |
- encoding is a string recognized by encodings.aliases |
|
| 3260 |
- ''' |
|
| 3261 |
- if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) |

| 3262 |
- # strip Byte Order Mark (if present) |
|
| 3263 |
- if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): |
|
| 3264 |
- if _debug: |
|
| 3265 |
- sys.stderr.write('stripping BOM\n') |

| 3266 |
- if encoding != 'utf-16be': |
|
| 3267 |
- sys.stderr.write('trying utf-16be instead\n') |

| 3268 |
- encoding = 'utf-16be' |
|
| 3269 |
- data = data[2:] |
|
| 3270 |
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): |
|
| 3271 |
- if _debug: |
|
| 3272 |
- sys.stderr.write('stripping BOM\n') |

| 3273 |
- if encoding != 'utf-16le': |
|
| 3274 |
- sys.stderr.write('trying utf-16le instead\n') |

| 3275 |
- encoding = 'utf-16le' |
|
| 3276 |
- data = data[2:] |
|
| 3277 |
- elif data[:3] == '\xef\xbb\xbf': |
|
| 3278 |
- if _debug: |
|
| 3279 |
- sys.stderr.write('stripping BOM\n') |

| 3280 |
- if encoding != 'utf-8': |
|
| 3281 |
- sys.stderr.write('trying utf-8 instead\n') |

| 3282 |
- encoding = 'utf-8' |
|
| 3283 |
- data = data[3:] |
|
| 3284 |
- elif data[:4] == '\x00\x00\xfe\xff': |
|
| 3285 |
- if _debug: |
|
| 3286 |
- sys.stderr.write('stripping BOM\n') |

| 3287 |
- if encoding != 'utf-32be': |
|
| 3288 |
- sys.stderr.write('trying utf-32be instead\n') |

| 3289 |
- encoding = 'utf-32be' |
|
| 3290 |
- data = data[4:] |
|
| 3291 |
- elif data[:4] == '\xff\xfe\x00\x00': |
|
| 3292 |
- if _debug: |
|
| 3293 |
- sys.stderr.write('stripping BOM\n') |

| 3294 |
- if encoding != 'utf-32le': |
|
| 3295 |
- sys.stderr.write('trying utf-32le instead\n') |

| 3296 |
- encoding = 'utf-32le' |
|
| 3297 |
- data = data[4:] |
|
| 3298 |
- newdata = unicode(data, encoding) |
|
| 3299 |
- if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) |

| 3300 |
- declmatch = re.compile('^<\?xml[^>]*?>') |

| 3301 |
- newdecl = '''<?xml version='1.0' encoding='utf-8'?>''' |
|
| 3302 |
- if declmatch.search(newdata): |
|
| 3303 |
- newdata = declmatch.sub(newdecl, newdata) |
|
| 3304 |
- else: |
|
| 3305 |
- newdata = newdecl + u'\n' + newdata |
|
| 3306 |
- return newdata.encode('utf-8') |

| 3307 |
- |
|
| 3308 |
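The BOM comparisons above hard-code the byte signatures; the stdlib codecs module ships the same constants, which a sketch of the stripping step can lean on. Order matters because the UTF-32 LE signature begins with the UTF-16 LE one:

    import codecs

    _BOMS = [
        (codecs.BOM_UTF32_BE, 'utf-32be'),
        (codecs.BOM_UTF32_LE, 'utf-32le'),   # must be tested before utf-16le
        (codecs.BOM_UTF16_BE, 'utf-16be'),
        (codecs.BOM_UTF16_LE, 'utf-16le'),
        (codecs.BOM_UTF8, 'utf-8'),
    ]

    def strip_bom(data):
        '''Return (encoding_or_None, data_without_BOM).'''
        for bom, name in _BOMS:
            if data.startswith(bom):
                return name, data[len(bom):]
        return None, data

    print(strip_bom(b'\xef\xbb\xbf<?xml'))   # ('utf-8', b'<?xml')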
-def _stripDoctype(data): |
|
| 3309 |
- '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) |
|
| 3310 |
- |
|
| 3311 |
- rss_version may be 'rss091n' or None |
|
| 3312 |
- stripped_data is the same XML document, minus the DOCTYPE |
|
| 3313 |
- ''' |
|
| 3314 |
- start = re.search('<\w',data) |

| 3315 |
- start = start and start.start() or -1 |
|
| 3316 |
- head,data = data[:start+1], data[start+1:] |
|
| 3317 |
- |
|
| 3318 |
- entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) |
|
| 3319 |
- entity_results=entity_pattern.findall(head) |
|
| 3320 |
- head = entity_pattern.sub('', head) |

| 3321 |
- doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) |
|
| 3322 |
- doctype_results = doctype_pattern.findall(head) |
|
| 3323 |
- doctype = doctype_results and doctype_results[0] or '' |
|
| 3324 |
- if doctype.lower().count('netscape'): |

| 3325 |
- version = 'rss091n' |
|
| 3326 |
- else: |
|
| 3327 |
- version = None |
|
| 3328 |
- |
|
| 3329 |
- # only allow in 'safe' inline entity definitions |
|
| 3330 |
- replacement='' |
|
| 3331 |
- if len(doctype_results)==1 and entity_results: |
|
| 3332 |
- safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"') |

| 3333 |
- safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) |
|
| 3334 |
- if safe_entities: |
|
| 3335 |
- replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities) |
|
| 3336 |
- data = doctype_pattern.sub(replacement, head) + data |
|
| 3337 |
- |
|
| 3338 |
- return version, data, dict(replacement and safe_pattern.findall(replacement)) |
|
| 3339 |
- |
|
| 3340 |
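The "safe" inline-entity filter above keeps an ENTITY definition only when it is a bare name with a quoted value that is either a character reference or plain text containing no ampersand or double quote. A small demonstration of the same regex:

    import re

    safe_pattern = re.compile(r'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
    entities = [' nbsp "&#160;"', ' bad "one &amp; two"']
    print([e for e in entities if safe_pattern.match(e)])
    # only the nbsp definition survives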
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): |
|
| 3341 |
- '''Parse a feed from a URL, file, stream, or string''' |
|
| 3342 |
- result = FeedParserDict() |
|
| 3343 |
- result['feed'] = FeedParserDict() |
|
| 3344 |
- result['entries'] = [] |
|
| 3345 |
- if _XML_AVAILABLE: |
|
| 3346 |
- result['bozo'] = 0 |
|
| 3347 |
- if type(handlers) == types.InstanceType: |
|
| 3348 |
- handlers = [handlers] |
|
| 3349 |
- try: |
|
| 3350 |
- f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) |
|
| 3351 |
- data = f.read() |
|
| 3352 |
- except Exception as e: |
|
| 3353 |
- result['bozo'] = 1 |
|
| 3354 |
- result['bozo_exception'] = e |
|
| 3355 |
- data = '' |
|
| 3356 |
- f = None |
|
| 3357 |
- |
|
| 3358 |
- # if feed is gzip-compressed, decompress it |
|
| 3359 |
- if f and data and hasattr(f, 'headers'): |
|
| 3360 |
- if gzip and f.headers.get('content-encoding', '') == 'gzip':
|
|
| 3361 |
- try: |
|
| 3362 |
- data = gzip.GzipFile(fileobj=_StringIO(data)).read() |
|
| 3363 |
- except Exception as e: |
|
| 3364 |
- # Some feeds claim to be gzipped but they're not, so |
|
| 3365 |
- # we get garbage. Ideally, we should re-request the |
|
| 3366 |
- # feed without the 'Accept-encoding: gzip' header, |
|
| 3367 |
- # but we don't. |
|
| 3368 |
- result['bozo'] = 1 |
|
| 3369 |
- result['bozo_exception'] = e |
|
| 3370 |
- data = '' |
|
| 3371 |
- elif zlib and f.headers.get('content-encoding', '') == 'deflate': |

| 3372 |
- try: |
|
| 3373 |
- data = zlib.decompress(data, -zlib.MAX_WBITS) |
|
| 3374 |
- except Exception as e: |
|
| 3375 |
- result['bozo'] = 1 |
|
| 3376 |
- result['bozo_exception'] = e |
|
| 3377 |
- data = '' |
|
| 3378 |
- |
|
| 3379 |
- # save HTTP headers |
|
| 3380 |
- if hasattr(f, 'info'): |
|
| 3381 |
- info = f.info() |
|
| 3382 |
- etag = info.getheader('ETag') |

| 3383 |
- if etag: |
|
| 3384 |
- result['etag'] = etag |
|
| 3385 |
- last_modified = info.getheader('Last-Modified') |

| 3386 |
- if last_modified: |
|
| 3387 |
- result['modified'] = _parse_date(last_modified) |
|
| 3388 |
- if hasattr(f, 'url'): |
|
| 3389 |
- result['href'] = f.url |
|
| 3390 |
- result['status'] = 200 |
|
| 3391 |
- if hasattr(f, 'status'): |
|
| 3392 |
- result['status'] = f.status |
|
| 3393 |
- if hasattr(f, 'headers'): |
|
| 3394 |
- result['headers'] = f.headers.dict |
|
| 3395 |
- if hasattr(f, 'close'): |
|
| 3396 |
- f.close() |
|
| 3397 |
- |
|
| 3398 |
- # there are four encodings to keep track of: |
|
| 3399 |
- # - http_encoding is the encoding declared in the Content-Type HTTP header |
|
| 3400 |
- # - xml_encoding is the encoding declared in the <?xml declaration |
|
| 3401 |
- # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data |
|
| 3402 |
- # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications |
|
| 3403 |
- http_headers = result.get('headers', {}) |

| 3404 |
- result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ |
|
| 3405 |
- _getCharacterEncoding(http_headers, data) |
|
| 3406 |
- if http_headers and (not acceptable_content_type): |
|
| 3407 |
- if http_headers.has_key('content-type'): |

| 3408 |
- bozo_message = '%s is not an XML media type' % http_headers['content-type'] |
|
| 3409 |
- else: |
|
| 3410 |
- bozo_message = 'no Content-type specified' |
|
| 3411 |
- result['bozo'] = 1 |
|
| 3412 |
- result['bozo_exception'] = NonXMLContentType(bozo_message) |
|
| 3413 |
- |
|
| 3414 |
- result['version'], data, entities = _stripDoctype(data) |
|
| 3415 |
- |
|
| 3416 |
- baseuri = http_headers.get('content-location', result.get('href')) |

| 3417 |
- baselang = http_headers.get('content-language', None) |

| 3418 |
- |
|
| 3419 |
- # if server sent 304, we're done |
|
| 3420 |
- if result.get('status', 0) == 304: |

| 3421 |
- result['version'] = '' |
|
| 3422 |
- result['debug_message'] = 'The feed has not changed since you last checked, ' + \ |
|
| 3423 |
- 'so the server sent no data. This is a feature, not a bug!' |
|
| 3424 |
- return result |
|
| 3425 |
- |
|
| 3426 |
- # if there was a problem downloading, we're done |
|
| 3427 |
- if not data: |
|
| 3428 |
- return result |
|
| 3429 |
- |
|
| 3430 |
- # determine character encoding |
|
| 3431 |
- use_strict_parser = 0 |
|
| 3432 |
- known_encoding = 0 |
|
| 3433 |
- tried_encodings = [] |
|
| 3434 |
- # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM |
|
| 3435 |
- for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): |
|
| 3436 |
- if not proposed_encoding: continue |
|
| 3437 |
- if proposed_encoding in tried_encodings: continue |
|
| 3438 |
- tried_encodings.append(proposed_encoding) |
|
| 3439 |
- try: |
|
| 3440 |
- data = _toUTF8(data, proposed_encoding) |
|
| 3441 |
- known_encoding = use_strict_parser = 1 |
|
| 3442 |
- break |
|
| 3443 |
- except: |
|
| 3444 |
- pass |
|
| 3445 |
- # if no luck and we have auto-detection library, try that |
|
| 3446 |
- if (not known_encoding) and chardet: |
|
| 3447 |
- try: |
|
| 3448 |
- proposed_encoding = chardet.detect(data)['encoding'] |
|
| 3449 |
- if proposed_encoding and (proposed_encoding not in tried_encodings): |
|
| 3450 |
- tried_encodings.append(proposed_encoding) |
|
| 3451 |
- data = _toUTF8(data, proposed_encoding) |
|
| 3452 |
- known_encoding = use_strict_parser = 1 |
|
| 3453 |
- except: |
|
| 3454 |
- pass |
|
| 3455 |
- # if still no luck and we haven't tried utf-8 yet, try that |
|
| 3456 |
- if (not known_encoding) and ('utf-8' not in tried_encodings):
|
|
| 3457 |
- try: |
|
| 3458 |
- proposed_encoding = 'utf-8' |
|
| 3459 |
- tried_encodings.append(proposed_encoding) |
|
| 3460 |
- data = _toUTF8(data, proposed_encoding) |
|
| 3461 |
- known_encoding = use_strict_parser = 1 |
|
| 3462 |
- except: |
|
| 3463 |
- pass |
|
| 3464 |
- # if still no luck and we haven't tried windows-1252 yet, try that |
|
| 3465 |
- if (not known_encoding) and ('windows-1252' not in tried_encodings):
|
|
| 3466 |
- try: |
|
| 3467 |
- proposed_encoding = 'windows-1252' |
|
| 3468 |
- tried_encodings.append(proposed_encoding) |
|
| 3469 |
- data = _toUTF8(data, proposed_encoding) |
|
| 3470 |
- known_encoding = use_strict_parser = 1 |
|
| 3471 |
- except: |
|
| 3472 |
- pass |
|
| 3473 |
- # if still no luck and we haven't tried iso-8859-2 yet, try that. |
|
| 3474 |
- if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
|
|
| 3475 |
- try: |
|
| 3476 |
- proposed_encoding = 'iso-8859-2' |
|
| 3477 |
- tried_encodings.append(proposed_encoding) |
|
| 3478 |
- data = _toUTF8(data, proposed_encoding) |
|
| 3479 |
- known_encoding = use_strict_parser = 1 |
|
| 3480 |
- except: |
|
| 3481 |
- pass |
|
| 3482 |
- # if still no luck, give up |
|
| 3483 |
- if not known_encoding: |
|
| 3484 |
- result['bozo'] = 1 |
|
| 3485 |
- result['bozo_exception'] = CharacterEncodingUnknown( \ |
|
| 3486 |
- 'document encoding unknown, I tried ' + \ |
|
| 3487 |
- '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \ |
|
| 3488 |
- (result['encoding'], xml_encoding)) |
|
| 3489 |
- result['encoding'] = '' |
|
| 3490 |
- elif proposed_encoding != result['encoding']: |
|
| 3491 |
- result['bozo'] = 1 |
|
| 3492 |
- result['bozo_exception'] = CharacterEncodingOverride( \ |
|
| 3493 |
- 'documented declared as %s, but parsed as %s' % \ |
|
| 3494 |
- (result['encoding'], proposed_encoding)) |
|
| 3495 |
- result['encoding'] = proposed_encoding |
|
| 3496 |
- |
|
| 3497 |
- if not _XML_AVAILABLE: |
|
| 3498 |
- use_strict_parser = 0 |
|
| 3499 |
- if use_strict_parser: |
|
| 3500 |
- # initialize the SAX parser |
|
| 3501 |
- feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') |
|
| 3502 |
- saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) |
|
| 3503 |
- saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) |
|
| 3504 |
- saxparser.setContentHandler(feedparser) |
|
| 3505 |
- saxparser.setErrorHandler(feedparser) |
|
| 3506 |
- source = xml.sax.xmlreader.InputSource() |
|
| 3507 |
- source.setByteStream(_StringIO(data)) |
|
| 3508 |
- if hasattr(saxparser, '_ns_stack'): |
|
| 3509 |
- # work around bug in built-in SAX parser (doesn't recognize xml: namespace) |
|
| 3510 |
- # PyXML doesn't have this problem, and it doesn't have _ns_stack either |
|
| 3511 |
- saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
|
|
| 3512 |
- try: |
|
| 3513 |
- saxparser.parse(source) |
|
| 3514 |
- except Exception, e: |
|
| 3515 |
- if _debug: |
|
| 3516 |
- import traceback |
|
| 3517 |
- traceback.print_stack() |
|
| 3518 |
- traceback.print_exc() |
|
| 3519 |
- sys.stderr.write('xml parsing failed\n')
|
|
| 3520 |
- result['bozo'] = 1 |
|
| 3521 |
- result['bozo_exception'] = feedparser.exc or e |
|
| 3522 |
- use_strict_parser = 0 |
|
| 3523 |
- if not use_strict_parser: |
|
| 3524 |
- feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '', entities) |
|
| 3525 |
- feedparser.feed(data) |
|
| 3526 |
- result['feed'] = feedparser.feeddata |
|
| 3527 |
- result['entries'] = feedparser.entries |
|
| 3528 |
- result['version'] = result['version'] or feedparser.version |
|
| 3529 |
- result['namespaces'] = feedparser.namespacesInUse |
|
| 3530 |
- return result |
|
| 3531 |
- |
|
| 3532 |
-class Serializer: |
|
| 3533 |
- def __init__(self, results): |
|
| 3534 |
- self.results = results |
|
| 3535 |
- |
|
| 3536 |
-class TextSerializer(Serializer): |
|
| 3537 |
- def write(self, stream=sys.stdout): |
|
| 3538 |
- self._writer(stream, self.results, '') |
|
| 3539 |
- |
|
| 3540 |
- def _writer(self, stream, node, prefix): |
|
| 3541 |
- if not node: return |
|
| 3542 |
- if hasattr(node, 'keys'): |
|
| 3543 |
- keys = node.keys() |
|
| 3544 |
- keys.sort() |
|
| 3545 |
- for k in keys: |
|
| 3546 |
- if k in ('description', 'link'): continue
|
|
| 3547 |
- if node.has_key(k + '_detail'): continue |
|
| 3548 |
- if node.has_key(k + '_parsed'): continue |
|
| 3549 |
- self._writer(stream, node[k], prefix + k + '.') |
|
| 3550 |
- elif type(node) == types.ListType: |
|
| 3551 |
- index = 0 |
|
| 3552 |
- for n in node: |
|
| 3553 |
- self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].') |
|
| 3554 |
- index += 1 |
|
| 3555 |
- else: |
|
| 3556 |
- try: |
|
| 3557 |
- s = str(node).encode('utf-8')
|
|
| 3558 |
- s = s.replace('\\', '\\\\')
|
|
| 3559 |
- s = s.replace('\r', '')
|
|
| 3560 |
- s = s.replace('\n', r'\n')
|
|
| 3561 |
- stream.write(prefix[:-1]) |
|
| 3562 |
- stream.write('=')
|
|
| 3563 |
- stream.write(s) |
|
| 3564 |
- stream.write('\n')
|
|
| 3565 |
- except: |
|
| 3566 |
- pass |
|
| 3567 |
- |
|
| 3568 |
-class PprintSerializer(Serializer): |
|
| 3569 |
- def write(self, stream=sys.stdout): |
|
| 3570 |
- if self.results.has_key('href'):
|
|
| 3571 |
- stream.write(self.results['href'] + '\n\n') |
|
| 3572 |
- from pprint import pprint |
|
| 3573 |
- pprint(self.results, stream) |
|
| 3574 |
- stream.write('\n')
|
|
| 3575 |
- |
|
| 3576 |
-if __name__ == '__main__': |
|
| 3577 |
- try: |
|
| 3578 |
- from optparse import OptionParser |
|
| 3579 |
- except: |
|
| 3580 |
- OptionParser = None |
|
| 3581 |
- |
|
| 3582 |
- if OptionParser: |
|
| 3583 |
- optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-") |
|
| 3584 |
- optionParser.set_defaults(format="pprint") |
|
| 3585 |
- optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
|
|
| 3586 |
- optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
|
|
| 3587 |
- optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
|
|
| 3588 |
- optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
|
|
| 3589 |
- optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
|
|
| 3590 |
- optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
|
|
| 3591 |
- (options, urls) = optionParser.parse_args() |
|
| 3592 |
- if options.verbose: |
|
| 3593 |
- _debug = 1 |
|
| 3594 |
- if not urls: |
|
| 3595 |
- optionParser.print_help() |
|
| 3596 |
- sys.exit(0) |
|
| 3597 |
- else: |
|
| 3598 |
- if not sys.argv[1:]: |
|
| 3599 |
- print __doc__ |
|
| 3600 |
- sys.exit(0) |
|
| 3601 |
- class _Options: |
|
| 3602 |
- etag = modified = agent = referrer = None |
|
| 3603 |
- format = 'pprint' |
|
| 3604 |
- options = _Options() |
|
| 3605 |
- urls = sys.argv[1:] |
|
| 3606 |
- |
|
| 3607 |
- zopeCompatibilityHack() |
|
| 3608 |
- |
|
| 3609 |
- serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer) |
|
| 3610 |
- for url in urls: |
|
| 3611 |
- results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer) |
|
| 3612 |
- serializer(results).write(sys.stdout) |
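Editor's note: the block deleted above is the tail of the Python 2 feedparser that had been vendored into this repository: gzip/deflate handling, HTTP header bookkeeping, the character-encoding fallback chain (declared encoding, chardet, utf-8, windows-1252, iso-8859-2), and the strict/loose parser switch. Under Python 3 the same behavior comes from the installable feedparser package, so the vendored copy can simply go. A minimal sketch of the replacement, assuming "pip install feedparser" and a placeholder URL:

    import feedparser

    # feedparser.parse() does the fetching, decompression, encoding
    # detection, and strict-then-loose parsing the deleted code did by hand.
    d = feedparser.parse('https://techcrunch.com/feed/')  # placeholder URL
    if d.bozo:  # feed was malformed; parsed loosely or not at all
        print('bozo_exception:', d.bozo_exception)
    print(d.get('status'), d.get('encoding'), d.get('etag'))
    for entry in d.entries[:5]:
        print(entry.get('title'))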
| ... | ... |
@@ -1,4 +1,4 @@ |
| 1 |
-#!/usr/bin/env python |
| 1 |
+#!/home/dblume/opt/python-3.9.6/bin/python3 |
| 2 | 2 |
# |
| 3 | 3 |
# Testing without affecting the yaml file and saving the updated one aside: |
| 4 | 4 |
# cp techcrunch.yaml techcrunch.yaml_back; ./techcrunch.py; \ |
| ... | ... |
@@ -14,18 +14,16 @@ import codecs |
| 14 | 14 |
import traceback |
| 15 | 15 |
import calendar |
| 16 | 16 |
import pickle |
| 17 |
-import exceptions |
| 18 |
-import urllib |
| 19 |
-import urllib2 |
| 20 |
-import httplib |
| 17 |
+import urllib.request, urllib.parse, urllib.error |
| 18 |
+import http.client |
| 21 | 19 |
import shutil |
| 22 | 20 |
import smtplib |
| 23 | 21 |
import analysis |
| 24 | 22 |
import json |
| 25 | 23 |
import xml |
| 26 | 24 |
import operator |
| 27 |
-import cgi |
| 28 |
-import cStringIO |
| 25 |
+import html |
| 26 |
+import io |
| 29 | 27 |
import smtp_creds # Your own credentials, used in send_email() |
| 30 | 28 |
|
| 31 | 29 |
debug = True |
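Editor's note: the import hunk above is the standard Python 2-to-3 module renames. For reference, the correspondence this commit relies on, sketched with calls that appear later in the file:

    import urllib.request, urllib.parse, urllib.error  # was urllib / urllib2
    import http.client                                 # was httplib
    import html                                        # html.escape was cgi.escape
    import io                                          # io.StringIO was cStringIO.StringIO
    # "import exceptions" is gone entirely: built-in exceptions such as
    # TypeError are always in scope in Python 3.

    quoted = urllib.parse.quote_plus('value with spaces')  # was urllib.quote_plus
    escaped = html.escape('<b> & </b>')                    # was cgi.escape
    buf = io.StringIO()                                    # was cStringIO.StringIO()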
| ... | ... |
@@ -144,7 +142,7 @@ def send_email(subject, message, toaddrs, |
| 144 | 142 |
|
| 145 | 143 |
def index_id(a_list, op, elem): |
| 146 | 144 |
try: |
| 147 |
-        return (index for index, item in enumerate(a_list) if op(item, elem)).next() |
| 145 |
+        return next((index for index, item in enumerate(a_list) if op(item, elem))) |
| 148 | 146 |
except: |
| 149 | 147 |
return -1 |
| 150 | 148 |
|
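Editor's note: the generator method .next() no longer exists in Python 3, hence the next() built-in above. next() also accepts a default, which would let index_id return -1 without the bare except; a sketch of that variant:

    def index_id(a_list, op, elem):
        # The second argument to next() is returned when the generator is
        # exhausted, so no exception handling is needed.
        return next((i for i, item in enumerate(a_list) if op(item, elem)), -1)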
| ... | ... |
@@ -219,31 +217,31 @@ def process_feed(yaml_items): |
| 219 | 217 |
else: |
| 220 | 218 |
if feed.status != 200 and feed.status != 307 and feed.status != 301 and feed.status != 302: |
| 221 | 219 |
if feed.status == 503: |
| 222 |
-                print "the feed is temporarily unavailable." |
| 220 |
+                print("the feed is temporarily unavailable.") |
| 223 | 221 |
            elif feed.status == 400: |
| 224 |
-                print "the feed says we made a bad request." |
| 222 |
+                print("the feed says we made a bad request.") |
| 225 | 223 |
            elif feed.status == 502: |
| 226 |
-                print "the feed reported a bad gateway error." |
| 224 |
+                print("the feed reported a bad gateway error.") |
| 227 | 225 |
            elif feed.status == 404: |
| 228 |
-                print "the feed says the page was not found." |
| 226 |
+                print("the feed says the page was not found.") |
| 229 | 227 |
            elif feed.status == 500: |
| 230 |
-                print "the feed had an internal server error." |
| 228 |
+                print("the feed had an internal server error.") |
| 231 | 229 |
            elif feed.status == 403: |
| 232 |
-                print "Access to the feed was forbidden." |
| 230 |
+                print("Access to the feed was forbidden.") |
| 233 | 231 |
            else: |
| 234 |
-                print "the feed returned feed.status %d." % ( feed.status, ) |
| 232 |
+                print("the feed returned feed.status %d." % ( feed.status, )) |
| 235 | 233 |
else: |
| 236 | 234 |
# Save off this |
| 237 | 235 |
if hasattr(feed, 'bozo_exception') and isinstance(feed.bozo_exception, xml.sax._exceptions.SAXParseException): |
| 238 |
-                print "Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % (str(feed.bozo_exception)) |
| 236 |
+                print("Didn't pickle TechCrunch feed because it had a bozo_exception: %s" % (str(feed.bozo_exception))) |
| 239 | 237 |
else: |
| 240 | 238 |
try: |
| 241 | 239 |
with open(os.path.join(localdir, 'techcrunch_feed.pickle'), 'wb') as f: |
| 242 | 240 |
pickle.dump(feed, f) |
| 243 | 241 |
except(pickle.PicklingError, exceptions.TypeError) as e: |
| 244 |
-                    print "An error occurred while pickling the feed: %s." % \ |
| 242 |
+                    print("An error occurred while pickling the feed: %s." % \ |
| 245 | 243 |
                        (# str(e.__class__), |
| 246 |
-                        str(e)) |
| 244 |
+                        str(e))) |
| 247 | 245 |
traceback.print_exc(3, file=sys.stdout) |
| 248 | 246 |
|
| 249 | 247 |
for i in reversed(feed.entries): |
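Editor's note: the if/elif chain above maps each HTTP status code to a message one branch at a time. An equivalent, more compact pattern (a sketch, not what this commit does) is a dictionary lookup with a formatted fallback:

    STATUS_MESSAGES = {
        503: "the feed is temporarily unavailable.",
        400: "the feed says we made a bad request.",
        502: "the feed reported a bad gateway error.",
        404: "the feed says the page was not found.",
        500: "the feed had an internal server error.",
        403: "Access to the feed was forbidden.",
    }

    print(STATUS_MESSAGES.get(feed.status,
                              "the feed returned feed.status %d." % feed.status))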
| ... | ... |
@@ -261,32 +259,32 @@ def process_feed(yaml_items): |
| 261 | 259 |
else: |
| 262 | 260 |
if hasattr(feed, 'bozo_exception'): |
| 263 | 261 |
e = feed.bozo_exception |
| 264 |
-            if isinstance(e, urllib2.URLError): |
| 262 |
+            if isinstance(e, urllib.error.URLError): |
| 265 | 263 |
print_last_line = True |
| 266 | 264 |
if hasattr(e, 'reason'): |
| 267 | 265 |
if e.reason[0] == 110: |
| 268 |
-                        print "the feed's connection timed out." |
| 266 |
+                        print("the feed's connection timed out.") |
| 269 | 267 |
print_last_line = False |
| 270 | 268 |
elif e.reason[0] == 111: |
| 271 |
-                        print "the feed's connection was refused." |
| 269 |
+                        print("the feed's connection was refused.") |
| 272 | 270 |
print_last_line = False |
| 273 | 271 |
elif e.reason[0] == 104: |
| 274 |
-                        print "the feed reset the connection." |
| 272 |
+                        print("the feed reset the connection.") |
| 275 | 273 |
print_last_line = False |
| 276 | 274 |
else: |
| 277 |
-                        print "the feed had a URLError with reason %s." % (str(e.reason),) |
| 275 |
+                        print("the feed had a URLError with reason %s." % (str(e.reason),)) |
| 278 | 276 |
print_last_line = False |
| 279 | 277 |
if print_last_line: |
| 280 |
-                print "the feed had a URLError %s" % (str(e),) |
| 281 |
-            elif isinstance(e, httplib.BadStatusLine): |
| 282 |
-                print "the feed gave a bad status line. (%s)" % (str(e),) |
| 278 |
+                print("the feed had a URLError %s" % (str(e),)) |
| 279 |
+            elif isinstance(e, http.client.BadStatusLine): |
| 280 |
+                print("the feed gave a bad status line. (%s)" % (str(e),)) |
| 283 | 281 |
else: |
| 284 | 282 |
if len(str(e)): |
| 285 |
-                    print "the feed bozo_exception: %s \"%s\"" % (str(e.__class__), str(e)) |
| 283 |
+                    print("the feed bozo_exception: %s \"%s\"" % (str(e.__class__), str(e))) |
| 286 | 284 |
else: |
| 287 |
-                    print "the feed bozo_exception: %s %s" % (str(e.__class__), repr(e)) |
| 285 |
+                    print("the feed bozo_exception: %s %s" % (str(e.__class__), repr(e))) |
| 288 | 286 |
else: |
| 289 |
-            print "the feed returned class %s, %s" % (str(feed.__class__), str(feed)) |
| 287 |
+            print("the feed returned class %s, %s" % (str(feed.__class__), str(feed))) |
| 290 | 288 |
|
| 291 | 289 |
|
| 292 | 290 |
def process_item(feed_item, yaml_items): |
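Editor's note: one Python 3 subtlety this hunk does not address: the comparisons e.reason[0] == 110 above date from Python 2, where URLError.reason was often an (errno, message) tuple. In Python 3 it is usually an OSError, which is not indexable, so those branches would raise TypeError rather than match. A hedged sketch of an errno-based check:

    import errno
    import urllib.error

    def describe_urlerror(e: urllib.error.URLError) -> str:
        # In Python 3, e.reason typically carries .errno; compare errno
        # constants instead of indexing a tuple.
        code = getattr(e.reason, 'errno', None)
        if code == errno.ETIMEDOUT:       # 110 on Linux
            return "the feed's connection timed out."
        if code == errno.ECONNREFUSED:    # 111 on Linux
            return "the feed's connection was refused."
        if code == errno.ECONNRESET:      # 104 on Linux
            return "the feed reset the connection."
        return "the feed had a URLError with reason %s." % (e.reason,)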
| ... | ... |
@@ -300,7 +298,7 @@ def process_item(feed_item, yaml_items): |
| 300 | 298 |
elif hasattr(feed_item, 'date_parsed'): |
| 301 | 299 |
date_parsed = feed_item.date_parsed |
| 302 | 300 |
else: |
| 303 |
-        print "process_item found no timestamp for", asciiize(feed_item.link) |
| 301 |
+        print("process_item found no timestamp for", asciiize(feed_item.link)) |
| 304 | 302 |
timecode_parsed = calendar.timegm(date_parsed) |
| 305 | 303 |
|
| 306 | 304 |
link = feed_item.link |
| ... | ... |
@@ -398,21 +396,21 @@ def Get_fb_stats(url_string): |
| 398 | 396 |
    url_string = url_string.encode('utf-8') |
| 399 | 397 |
|
| 400 | 398 |
try: |
| 401 |
-        encoded = urllib.urlencode({'access_token': facebook_token}) |
| 399 |
+        encoded = urllib.parse.urlencode({'access_token': facebook_token}) |
| 402 | 400 |
url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s' |
| 403 |
-        f = urllib2.urlopen(url % (urllib.quote_plus(url_string), encoded)) |
| 401 |
+        f = urllib.request.urlopen(url % (urllib.parse.quote_plus(url_string), encoded)) |
| 404 | 402 |
data = f.read() |
| 405 | 403 |
f.close() |
| 406 |
-    except (urllib2.URLError, httplib.BadStatusLine) as e: |
| 404 |
+    except (urllib.error.URLError, http.client.BadStatusLine) as e: |
| 407 | 405 |
if hasattr(e, 'reason'): # URLError |
| 408 | 406 |
if hasattr(e, 'code'): |
| 409 |
-                print "Get_fb_stats got an error (1):", e.code, e.reason, url_string |
| 407 |
+                print("Get_fb_stats got an error (1):", e.code, e.reason, url_string) |
| 410 | 408 |
else: |
| 411 |
-                print "Get_fb_stats got an error (2):", e.reason, url_string |
| 409 |
+                print("Get_fb_stats got an error (2):", e.reason, url_string) |
| 412 | 410 |
elif hasattr(e, 'code'): #URLError |
| 413 |
-            print "Get_fb_stats got an error. Code:", e.code, url_string |
| 411 |
+            print("Get_fb_stats got an error. Code:", e.code, url_string) |
| 414 | 412 |
else: |
| 415 |
-            print "Get_fb_stats got an error (3):", str(e) |
| 413 |
+            print("Get_fb_stats got an error (3):", str(e)) |
| 416 | 414 |
return shares, comments, likes |
| 417 | 415 |
if len(data) > 20: |
| 418 | 416 |
d = json.loads(data)['engagement'] |
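Editor's note: another 2-to-3 detail in this hunk: urllib.request.urlopen(...).read() returns bytes, and url_string was already encoded to UTF-8 bytes before quote_plus. Both happen to work (quote_plus accepts bytes, and json.loads has accepted bytes since Python 3.6), but a str-only version is less surprising. A sketch with placeholder values:

    import json
    import urllib.parse
    import urllib.request

    url_string = 'https://techcrunch.com/some-article/'  # placeholder
    facebook_token = 'TOKEN'                             # placeholder

    params = urllib.parse.urlencode({'access_token': facebook_token})
    url = 'https://graph.facebook.com/v2.11/?id=%s&fields=engagement&%s' % (
        urllib.parse.quote_plus(url_string), params)
    with urllib.request.urlopen(url) as f:
        data = f.read().decode('utf-8')  # bytes -> str before parsing
    engagement = json.loads(data).get('engagement', {})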
| ... | ... |
@@ -435,7 +433,7 @@ def Get_fb_stats(url_string): |
| 435 | 433 |
except KeyError: |
| 436 | 434 |
comments = 0 |
| 437 | 435 |
else: |
| 438 |
-        print "Get_fb_stats got too little data for ", url_string |
| 436 |
+        print("Get_fb_stats got too little data for ", url_string) |
| 439 | 437 |
return shares, comments, likes |
| 440 | 438 |
|
| 441 | 439 |
|
| ... | ... |
@@ -445,7 +443,7 @@ def make_index_html(yaml_items, weekend_stats, weekday_stats): |
| 445 | 443 |
new_index_fullpath = os.path.join(localdir, 'index.html_new') |
| 446 | 444 |
index_fullpath = os.path.join(localdir, 'index.html') |
| 447 | 445 |
|
| 448 |
-    chart_io = cStringIO.StringIO() |
| 446 |
+    chart_io = io.StringIO() |
| 449 | 447 |
for image_index, image in enumerate(yaml_items[:40]): |
| 450 | 448 |
tag_hit = False |
| 451 | 449 |
if image['author'].lower() in authors_to_post: |
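Editor's note: io.StringIO is the right replacement here only because chart_io collects text (generated markup). cStringIO accepted Python 2 byte strings as well; in Python 3 the two cases split, and binary data such as a rendered image would need io.BytesIO. A sketch of the distinction:

    import io

    text_buf = io.StringIO()   # str only, e.g. generated HTML
    text_buf.write('<div class="chart"></div>')

    byte_buf = io.BytesIO()    # bytes only, e.g. PNG data
    byte_buf.write(b'\x89PNG\r\n\x1a\n')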
| ... | ... |
@@ -498,8 +496,8 @@ def make_feed_file(yaml_items): |
| 498 | 496 |
for item in yaml_items: |
| 499 | 497 |
        now = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(item['orig_posted'])) |
| 500 | 498 |
if item['qualified'] != -1: |
| 501 |
-            escaped_title = cgi.escape(item['title']).encode('ascii', 'xmlcharrefreplace') |
| 502 |
-            escaped_author = cgi.escape(item['author']).encode('ascii', 'xmlcharrefreplace') |
| 499 |
+            escaped_title = html.escape(item['title']).encode('ascii', 'xmlcharrefreplace') |
| 500 |
+            escaped_author = html.escape(item['author']).encode('ascii', 'xmlcharrefreplace') |
| 503 | 501 |
            f.write("<item><title>%s</title><pubDate>%s</pubDate><link>%s</link><guid isPermaLink=\"false\">%s</guid><description><![CDATA[By: %s]]></description></item>\n" % \ |
| 504 | 502 |
(escaped_title, now, item['link'], item['link'], escaped_author)) |
| 505 | 503 |
count += 1 |
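Editor's note: two things to watch in this hunk. First, html.escape escapes quotes by default, unlike the old cgi.escape (pass quote=False for the old behavior). Second, .encode('ascii', 'xmlcharrefreplace') returns bytes in Python 3, so interpolating escaped_title into the f.write format string with %s produces a literal b'...' in the feed output; decoding back to str avoids that. A sketch of the pitfall:

    import html

    title = 'Über "quoted" & more'
    as_ascii = html.escape(title).encode('ascii', 'xmlcharrefreplace')

    print("<title>%s</title>" % as_ascii)                  # <title>b'&#220;ber ...'</title>  (bug)
    print("<title>%s</title>" % as_ascii.decode('ascii'))  # correct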
| ... | ... |
@@ -514,7 +512,7 @@ if __name__=='__main__': |
| 514 | 512 |
|
| 515 | 513 |
old_stdout = sys.stdout |
| 516 | 514 |
old_stderr = sys.stderr |
| 517 |
-    sys.stdout = sys.stderr = cStringIO.StringIO() |
| 515 |
+    sys.stdout = sys.stderr = io.StringIO() |
| 518 | 516 |
|
| 519 | 517 |
try: |
| 520 | 518 |
localdir = os.path.abspath(os.path.dirname(sys.argv[0])) |
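Editor's note: replacing cStringIO with io.StringIO keeps the existing capture-stdout-for-email pattern working. The modern idiom for the same thing (a sketch, not what the script does) is contextlib.redirect_stdout:

    import contextlib
    import io

    buf = io.StringIO()
    with contextlib.redirect_stdout(buf), contextlib.redirect_stderr(buf):
        print("this goes into buf, not the console")
    message = buf.getvalue()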
| ... | ... |
@@ -542,7 +540,7 @@ if __name__=='__main__': |
| 542 | 540 |
with open(yaml_fullpath, 'rb') as f: |
| 543 | 541 |
items = yaml.load(f, Loader=yaml.Loader) |
| 544 | 542 |
if items is None: |
| 545 |
-                print yaml_fullpath, "exists, but was empty." |
| 543 |
+                print(yaml_fullpath, "exists, but was empty.") |
| 546 | 544 |
items = [] |
| 547 | 545 |
|
| 548 | 546 |
# Do any dictionary item updating that might be necessary |
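Editor's note: passing Loader=yaml.Loader in the hunk above silences PyYAML 5's deprecation warning but keeps the full loader, which can construct arbitrary Python objects. Since this file holds plain lists and dicts, yaml.safe_load would be the conservative choice, assuming no custom YAML tags are in use:

    import yaml

    with open('techcrunch.yaml', 'rb') as f:  # filename used by the script
        items = yaml.safe_load(f) or []       # None -> [] for an empty file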
| ... | ... |
@@ -550,7 +548,7 @@ if __name__=='__main__': |
| 550 | 548 |
#        if not item.has_key('fb_shares'): |
| 551 | 549 |
# item['fb_shares'] = [] |
| 552 | 550 |
else: |
| 553 |
-        print "could not open", yaml_fullpath |
| 551 |
+        print("could not open", yaml_fullpath) |
| 554 | 552 |
items = [] |
| 555 | 553 |
|
| 556 | 554 |
with open(os.path.join(localdir, 'facebook-token.txt'), 'r') as f: |
| ... | ... |
@@ -607,7 +605,7 @@ if __name__=='__main__': |
| 607 | 605 |
try: |
| 608 | 606 |
os.rename(yaml_newfile_fullpath, yaml_fullpath) |
| 609 | 607 |
except OSError as e: |
| 610 |
-            print "The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath) |
| 608 |
+            print("The source file was", yaml_newfile_fullpath, "and exists =", os.path.isfile(yaml_newfile_fullpath)) |
| 611 | 609 |
with open(os.path.join(localdir, 'techcrunch_text.yaml'), 'w') as f: |
| 612 | 610 |
yaml.dump(items, f, default_flow_style=None, width=120) |
| 613 | 611 |
with codecs.open(os.path.join(localdir, 'techcrunch_unicode.yaml'), 'w', 'utf-8') as f: |
| ... | ... |
@@ -617,24 +615,24 @@ if __name__=='__main__': |
| 617 | 615 |
|
| 618 | 616 |
make_index_html(items, weekend_stats, weekday_stats) |
| 619 | 617 |
else: |
| 620 |
-            print "No entries were added this time." |
| 618 |
+            print("No entries were added this time.") |
| 621 | 619 |
|
| 622 | 620 |
except Exception as e: |
| 623 | 621 |
exceptional_text = "An exception occurred: " + str(e.__class__) + " " + str(e) |
| 624 |
-        print exceptional_text, ' '.join(progress_text) |
| 622 |
+        print(exceptional_text, ' '.join(progress_text)) |
| 625 | 623 |
traceback.print_exc(file=sys.stdout) |
| 626 | 624 |
try: |
| 627 | 625 |
            send_email('Exception thrown in ' + os.path.basename(__file__), |
| 628 | 626 |
exceptional_text + "\n" + traceback.format_exc(), |
| 629 | 627 |
(smtp_creds.default_recipient,)) |
| 630 | 628 |
except Exception as e: |
| 631 |
-            print "Could not send email to notify you of the exception. :(" |
| 629 |
+            print("Could not send email to notify you of the exception. :(") |
| 632 | 630 |
|
| 633 | 631 |
message = sys.stdout.getvalue() |
| 634 | 632 |
sys.stdout = old_stdout |
| 635 | 633 |
sys.stderr = old_stderr |
| 636 | 634 |
if not debug: |
| 637 |
-        print message |
| 635 |
+        print(message) |
| 638 | 636 |
|
| 639 | 637 |
# Finally, let's save this to a statistics page |
| 640 | 638 |
if os.path.exists(os.path.join(localdir, 'stats.txt')): |
| 641 | 639 |