- commit
- 7fa385428e8a21c7b4efc643ffcf5b6475ad7b6f
- parent
- 9dc70f62554d44f059ec9f3587c232a844d94991
- Author
- Tobias Bengfort <tobias.bengfort@gmx.net>
- Date
- 2015-12-06 07:50
mv reusable logic to separate module
Diffstat
| M | jsonproxy/__init__.py | 31 | +------------------------------ |
| M | jsonproxy/api.py | 82 | ++++++++++++++----------------------------------------------- |
| A | jsonproxy/lib.py | 82 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 101 insertions, 94 deletions
diff --git a/jsonproxy/__init__.py b/jsonproxy/__init__.py
@@ -7,36 +7,7 @@ import argparse 7 7 from flask import Flask 8 8 9 9 from .api import api10 -111 -1 ENDPOINTS = 'ENDPOINTS'12 -113 -114 -1 def check_fields_config(fields, endpoint, field=''):15 -1 for key, value in fields.items():16 -1 full_key = field + '.' + key if field else key17 -1 if isinstance(value, dict):18 -1 if 'selector' not in value:19 -1 yield ('No selector configured for field %s in endpoint %s.' %20 -1 (full_key, endpoint))21 -1 if 'fields' in value:22 -1 for error in check_fields_config(value['fields'], endpoint, full_key):23 -1 yield error24 -125 -126 -1 def check_config(config):27 -1 errors = []28 -129 -1 if ENDPOINTS not in config or len(config[ENDPOINTS]) == 0:30 -1 errors.append('No endpoints configured.')31 -1 else:32 -1 for key, data in config[ENDPOINTS].items():33 -1 if 'fields' in data:34 -1 if len(data['fields']) == 0:35 -1 errors.append('No fields configured for endpoint %s.' % key)36 -1 else:37 -1 errors += list(check_fields_config(data['fields'], key))38 -139 -1 return errors-1 10 from .lib import check_config 40 11 41 12 42 13 def main():
diff --git a/jsonproxy/api.py b/jsonproxy/api.py
@@ -1,3 +1,5 @@ -1 1 from __future__ import absolute_import -1 2 1 3 try: 2 4 from urllib.request import urlopen as _urlopen 3 5 from urllib.error import HTTPError @@ -15,9 +17,20 @@ from flask import render_template 15 17 from bs4 import BeautifulSoup 16 18 import cachetools 17 19 -1 20 from .lib import get_fields -1 21 from .lib import _doc -1 22 from .lib import ENDPOINTS -1 23 18 24 api = Blueprint('api', __name__, static_folder='static') 19 25 20 26 -1 27 def get_config(endpoint): -1 28 try: -1 29 return current_app.config[ENDPOINTS][endpoint] -1 30 except KeyError: -1 31 abort(404) -1 32 -1 33 21 34 @cachetools.ttl_cache() 22 35 def urlopen(url, parse=False): 23 36 try: @@ -36,39 +49,6 @@ def urlopen(url, parse=False): 36 49 abort(error.code) 37 50 38 5139 -1 def get_attribute_list(html, selector):40 -1 s = selector.rsplit('@', 1)[0]41 -1 if s:42 -1 elements = html.select(s)43 -1 else:44 -1 elements = [html]45 -146 -1 if '@' in selector:47 -1 attr = selector.rsplit('@', 1)[1]48 -1 return [element[attr] for element in elements]49 -1 else:50 -1 return [element.text.strip() for element in elements]51 -152 -153 -1 def get_attribute(html, selector):54 -1 l = get_attribute_list(html, selector)55 -1 if len(l) > 0:56 -1 return l[0]57 -158 -159 -1 def get_fields(html, config):60 -1 data = {}61 -1 for key, value in config['fields'].items():62 -1 if isinstance(value, str):63 -1 data[key] = get_attribute(html, value)64 -1 elif 'fields' in value:65 -1 elements = html.select(value['selector'])66 -1 data[key] = [get_fields(e, value) for e in elements]67 -1 else:68 -1 data[key] = get_attribute_list(html, value['selector'])69 -1 return data70 -171 -172 52 def scrape(url, config): 73 53 html = urlopen(url, parse=True) 74 54 data = get_fields(html, config) @@ -84,11 +64,7 @@ def proxy(url, config): 84 64 85 65 @api.route('/<endpoint>/<path:path>', methods=['GET']) 86 66 def main(endpoint, path):87 -1 try:88 -1 config = current_app.config['ENDPOINTS'][endpoint]89 -1 except 
KeyError:90 -1 abort(404)91 -1-1 67 config = get_config(endpoint) 92 68 url = request.url.replace(request.host_url + endpoint + '/', config['host']) 93 69 94 70 if 'fields' in config: @@ -102,36 +78,14 @@ def main(endpoint, path): 102 78 return response 103 79 104 80105 -1 def _fields_doc(config):106 -1 if isinstance(config, dict):107 -1 fields = config.get('fields', {})108 -1 doc = config.get('fields_doc', {})109 -1 for key in fields:110 -1 yield key, doc.get(key, ''), list(_fields_doc(fields[key]))111 -1112 -1113 -1 def _doc(endpoint):114 -1 config = current_app.config['ENDPOINTS'][endpoint]115 -1116 -1 data = {117 -1 'title': endpoint,118 -1 'doc': config.get('doc', ''),119 -1 'fields': list(_fields_doc(config)),120 -1 }121 -1122 -1 data['fields'].append(('url', 'url of the scraped page', []))123 -1124 -1 return data125 -1126 -1127 81 @api.route('/', methods=['GET']) 128 82 def index():129 -1 data = [_doc(endpoint) for endpoint in current_app.config['ENDPOINTS']]-1 83 config = current_app.config[ENDPOINTS] -1 84 data = [_doc(config[endpoint], endpoint) for endpoint in config] 130 85 return render_template('index.html', endpoints=data) 131 86 132 87 133 88 @api.route('/<endpoint>/', methods=['GET']) 134 89 def doc(endpoint):135 -1 if endpoint not in current_app.config['ENDPOINTS']:136 -1 abort(404)137 -1 return render_template('index.html', endpoints=[_doc(endpoint)])-1 90 config = get_config(endpoint) -1 91 return render_template('index.html', endpoints=[_doc(config, endpoint)])
diff --git a/jsonproxy/lib.py b/jsonproxy/lib.py
@@ -0,0 +1,82 @@
-1 1 ENDPOINTS = 'ENDPOINTS'
-1 2
-1 3
def get_attribute_list(html, selector):
    """Return one value per element matched by *selector*.

    *selector* has the form ``<css>[@<attribute>]``; it is split at the
    last ``@``.  The part before it is passed to ``html.select()``; when
    that part is empty, *html* itself is the only element considered.

    With an ``@<attribute>`` suffix the result holds ``element[attribute]``
    for every element (a missing attribute propagates whatever error the
    element raises on item access).  Without it the result holds each
    element's ``text`` with surrounding whitespace stripped.

    NOTE(review): *html* is presumably a BeautifulSoup tag/document;
    only ``select()``, item access and ``.text`` are relied upon here.
    """
    parts = selector.rsplit('@', 1)
    css = parts[0]
    elements = html.select(css) if css else [html]

    # rsplit yields two parts exactly when the selector contains an '@'.
    if len(parts) == 2:
        attr = parts[1]
        return [element[attr] for element in elements]
    return [element.text.strip() for element in elements]
-1 16
-1 17
def get_attribute(html, selector):
    """Return the first value found by ``get_attribute_list``.

    Returns ``None`` when the selector yields no values.
    """
    values = get_attribute_list(html, selector)
    return values[0] if values else None
-1 22
-1 23
def get_fields(html, config):
    """Extract the fields described by ``config['fields']`` from *html*.

    Each entry of ``config['fields']`` maps an output key to either

    * a plain selector string: the value is the first match returned by
      ``get_attribute`` (``None`` when nothing matches);
    * a dict containing ``'fields'``: every element matched by the dict's
      ``'selector'`` is processed recursively, giving a list of dicts;
    * a dict without ``'fields'``: the value is the full list returned by
      ``get_attribute_list`` for the dict's ``'selector'``.

    Returns a dict with one entry per configured field.  A dict entry
    without a ``'selector'`` key raises ``KeyError``.
    """
    data = {}
    for key, value in config['fields'].items():
        # Dispatch on dict-ness, like _fields_doc and check_fields_config
        # do.  Testing ``isinstance(value, str)`` instead would send
        # ``unicode`` selectors (Python 2) down the dict branch, where
        # ``value['selector']`` fails with a TypeError.
        if not isinstance(value, dict):
            data[key] = get_attribute(html, value)
        elif 'fields' in value:
            elements = html.select(value['selector'])
            data[key] = [get_fields(element, value) for element in elements]
        else:
            data[key] = get_attribute_list(html, value['selector'])
    return data
-1 35
-1 36
-1 37 def _fields_doc(config):
-1 38 if isinstance(config, dict):
-1 39 fields = config.get('fields', {})
-1 40 doc = config.get('fields_doc', {})
-1 41 for key in fields:
-1 42 yield key, doc.get(key, ''), list(_fields_doc(fields[key]))
-1 43
-1 44
def _doc(config, endpoint):
    """Build the documentation record for one endpoint.

    The result is a dict with the endpoint name as ``'title'``, the
    endpoint's ``'doc'`` text (empty string when absent) and the list of
    field descriptions from ``_fields_doc``, to which a fixed entry for
    the ``url`` field is appended.
    """
    description = config.get('doc', '')
    fields = list(_fields_doc(config))
    fields.append(('url', 'url of the scraped page', []))

    return {
        'title': endpoint,
        'doc': description,
        'fields': fields,
    }
-1 55
-1 56
def check_fields_config(fields, endpoint, field=''):
    """Yield an error message for every dict field lacking a selector.

    *fields* maps field names to their configuration.  Only dict values
    are checked; nested ``'fields'`` are checked recursively.  *field* is
    the dotted path of the enclosing field and is used to build the full
    field name reported in the messages.
    """
    for name, spec in fields.items():
        path = '.'.join((field, name)) if field else name
        if not isinstance(spec, dict):
            continue

        if 'selector' not in spec:
            yield ('No selector configured for field %s in endpoint %s.' %
                   (path, endpoint))
        if 'fields' in spec:
            for message in check_fields_config(spec['fields'], endpoint, path):
                yield message
-1 67
-1 68
def check_config(config):
    """Validate the endpoint configuration and return a list of errors.

    Reports a missing or empty ``ENDPOINTS`` entry.  Otherwise, for each
    endpoint that has a ``'fields'`` entry, reports an empty one or the
    problems found by ``check_fields_config``.  An empty list means no
    problems were found.
    """
    if ENDPOINTS not in config or len(config[ENDPOINTS]) == 0:
        return ['No endpoints configured.']

    errors = []
    for name, endpoint in config[ENDPOINTS].items():
        if 'fields' not in endpoint:
            continue
        if len(endpoint['fields']) == 0:
            errors.append('No fields configured for endpoint %s.' % name)
        else:
            errors.extend(check_fields_config(endpoint['fields'], name))

    return errors