- commit
- a8584f62f23281c3f219faae2f09b319585a6d9e
- parent
- fcc85232beceacf9b225ab752e25e215f1df0122
- Author
- Tobias Bengfort <tobias.bengfort@posteo.de>
- Date
- 2023-07-01 18:16
update clean_html()
Diffstat
| M | plutopluto/__init__.py | 55 | +++++++++++++++++++++++++++++++++++++++++++------------ |
1 files changed, 43 insertions, 12 deletions
diff --git a/plutopluto/__init__.py b/plutopluto/__init__.py
@@ -19,19 +19,50 @@ __version__ = '1.2.0'
 app = Flask(__name__)
 
 
-def strip_atts(s):
-    """Strip possibly dangerous HTML attributes."""
+def clean_html(s):
+    """Strip possibly dangerous HTML."""
+
+    allowed_tags = [
+        'p',
+        'a',
+        'ul',
+        'ol',
+        'li',
+        'blockquote',
+        'em',
+        'strong',
+        'img',
+        'video',
+        'h1',
+        'h2',
+        'h3',
+        'h4',
+        'h5',
+        'h6',
+        'pre',
+        'code',
+        'hr',
+        'table',
+        'tr',
+        'td',
+        'th',
+        'details',
+        'summary',
+    ]
+    allowed_attrs = ['href', 'src', 'alt', 'title']
 
-    whitelist = ['href', 'src', 'alt', 'title', 'datetime']
     tree = BeautifulSoup(s)
 
     for tag in tree.find_all():
-        l = []
-        for attr in tag.attrs:
-            if attr not in whitelist:
-                l.append(attr)
-        for attr in l:
-            del tag.attrs[attr]
+        if tag.name not in allowed_tags:
+            if tag.name in ['script']:
+                tag.extract()
+            else:
+                tag.hidden = True
+        else:
+            for attr in set(tag.attrs) - set(allowed_attrs):
+                del tag.attrs[attr]
+
     return str(tree)
 
 
@@ -55,14 +86,14 @@ def parse(url):
         d['source'] = feed.feed.get('title')
         if 'youtube' in url:
             template = u'<img alt="%s" src="%s" />\n<div>%s</div>'
-            d['content'] = strip_atts(template % (
+            d['content'] = clean_html(template % (
                 item['media_content'][0]['url'],
                 item['media_thumbnail'][0]['url'],
                 item['media_description']))
         elif 'content' in item:
-            d['content'] = strip_atts(item['content'][0]['value'])
+            d['content'] = clean_html(item['content'][0]['value'])
         else:
-            d['content'] = strip_atts(item.get('description'))
+            d['content'] = clean_html(item.get('description'))
         return d
 
     return {