- commit
- bd0cbefebd9d191c24bd1e0bd065aade6c66d7b5
- parent
- 7f46223aba8eec112efcd3cd5937a07e01d1af3f
- Author
- Tobias Bengfort <tobias.bengfort@gmx.net>
- Date
- 2015-12-06 21:20
mv more scraping code to lib
Diffstat
| M | jsonproxy/api.py | 32 | ++++++++++---------------------- |
| M | jsonproxy/lib.py | 23 | +++++++++++++++++++++++ |
2 files changed, 33 insertions, 22 deletions
diff --git a/jsonproxy/api.py b/jsonproxy/api.py
@@ -14,12 +14,11 @@ from flask import abort
 from flask import jsonify
 from flask import make_response
 from flask import render_template
-from bs4 import BeautifulSoup
 import cachetools
 
-from .lib import get_fields
 from .lib import _doc
 from .lib import ENDPOINTS
+from .lib import scrape
 
 api = Blueprint('api', __name__, static_folder='static')
 
@@ -32,7 +31,7 @@ def get_config(endpoint):
 
 
 @cachetools.ttl_cache()
-def urlopen(url, parse=False):
+def urlopen(url):
     try:
         current_app.logger.info('fetching %s' % url)
         original = _urlopen(url)
@@ -41,36 +40,25 @@ def urlopen(url, parse=False):
         code = original.getcode()
         headers = original.headers.items()
 
-        if parse:
-            return BeautifulSoup(body)
-        else:
-            return body, code, headers
+        return body, code, headers
     except HTTPError as error:
         abort(error.code)
 
 
-def scrape(url, config):
-    html = urlopen(url, parse=True)
-    data = get_fields(html, config)
-    data['url'] = url
-    if 'post' in config:
-        data = config['post'](data)
-    return jsonify(data)
-
-
-def proxy(url, config):
-    return make_response(*urlopen(url))
-
-
 @api.route('/<endpoint>/<path:path>', methods=['GET'])
 def main(endpoint, path):
     config = get_config(endpoint)
     url = request.url.replace(request.host_url + endpoint + '/', config['host'])
 
+    body, code, headers = urlopen(url)
+
     if 'fields' in config:
-        response = scrape(url, config)
+        if code == 200:
+            response = jsonify(scrape(url, body, config))
+        else:
+            abort(code)
     else:
-        response = proxy(url, config)
+        response = make_response(body, code)
 
     if current_app.config.get('ALLOW_CORS', False):
         response.headers['Access-Control-Allow-Origin'] = '*'
diff --git a/jsonproxy/lib.py b/jsonproxy/lib.py
@@ -1,3 +1,12 @@
+import argparse
+
+from bs4 import BeautifulSoup
+
+try:
+    from functools import lfu_cache
+except ImportError:
+    from cachetools import lfu_cache
+
 ENDPOINTS = 'ENDPOINTS'
 
 
@@ -34,6 +43,20 @@ def get_fields(html, config):
     return data
 
 
+@lfu_cache()
+def parse_html(body):
+    return BeautifulSoup(body)
+
+
+def scrape(url, body, config):
+    html = parse_html(body)
+    data = get_fields(html, config)
+    data['url'] = url
+    if 'post' in config:
+        data = config['post'](data)
+    return data
+
+
 def _fields_doc(config):
     if isinstance(config, dict):
         fields = config.get('fields', {})