plutopluto

git clone https://git.ce9e.org/plutopluto.git

commit
a8584f62f23281c3f219faae2f09b319585a6d9e
parent
fcc85232beceacf9b225ab752e25e215f1df0122
Author
Tobias Bengfort <tobias.bengfort@posteo.de>
Date
2023-07-01 18:16
update clean_html()

Diffstat

M plutopluto/__init__.py 55 +++++++++++++++++++++++++++++++++++++++++++------------

1 file changed, 43 insertions, 12 deletions


diff --git a/plutopluto/__init__.py b/plutopluto/__init__.py

@@ -19,19 +19,50 @@ __version__ = '1.2.0'
   19    19 app = Flask(__name__)
   20    20 
   21    21 
   22    -1 def strip_atts(s):
   23    -1     """Strip possibly dangerous HTML attributes."""
   -1    22 def clean_html(s):
   -1    23     """Strip possibly dangerous HTML."""
   -1    24 
   -1    25     allowed_tags = [
   -1    26         'p',
   -1    27         'a',
   -1    28         'ul',
   -1    29         'ol',
   -1    30         'li',
   -1    31         'blockquote',
   -1    32         'em',
   -1    33         'strong',
   -1    34         'img',
   -1    35         'video',
   -1    36         'h1',
   -1    37         'h2',
   -1    38         'h3',
   -1    39         'h4',
   -1    40         'h5',
   -1    41         'h6',
   -1    42         'pre',
   -1    43         'code',
   -1    44         'hr',
   -1    45         'table',
   -1    46         'tr',
   -1    47         'td',
   -1    48         'th',
   -1    49         'details',
   -1    50         'summary',
   -1    51     ]
   -1    52     allowed_attrs = ['href', 'src', 'alt', 'title']
   24    53 
   25    -1     whitelist = ['href', 'src', 'alt', 'title', 'datetime']
   26    54     tree = BeautifulSoup(s)
   27    55 
   28    56     for tag in tree.find_all():
   29    -1         l = []
   30    -1         for attr in tag.attrs:
   31    -1             if attr not in whitelist:
   32    -1                 l.append(attr)
   33    -1         for attr in l:
   34    -1             del tag.attrs[attr]
   -1    57         if tag.name not in allowed_tags:
   -1    58             if tag.name in ['script']:
   -1    59                 tag.extract()
   -1    60             else:
   -1    61                 tag.hidden = True
   -1    62         else:
   -1    63             for attr in set(tag.attrs) - set(allowed_attrs):
   -1    64                 del tag.attrs[attr]
   -1    65 
   35    66     return str(tree)
   36    67 
   37    68 
@@ -55,14 +86,14 @@ def parse(url):
   55    86         d['source'] = feed.feed.get('title')
   56    87         if 'youtube' in url:
   57    88             template = u'<img alt="%s" src="%s" />\n<div>%s</div>'
   58    -1             d['content'] = strip_atts(template % (
   -1    89             d['content'] = clean_html(template % (
   59    90                 item['media_content'][0]['url'],
   60    91                 item['media_thumbnail'][0]['url'],
   61    92                 item['media_description']))
   62    93         elif 'content' in item:
   63    -1             d['content'] = strip_atts(item['content'][0]['value'])
   -1    94             d['content'] = clean_html(item['content'][0]['value'])
   64    95         else:
   65    -1             d['content'] = strip_atts(item.get('description'))
   -1    96             d['content'] = clean_html(item.get('description'))
   66    97         return d
   67    98 
   68    99     return {