- commit
- 901b5090d4e4d995221d08e95aed7342fc20c89f
- parent
- 75dfc6834307e3989467112711318ba07ad9b0af
- Author
- Tobias Bengfort <tobias.bengfort@gmx.net>
- Date
- 2015-02-05 17:58
Gardening
Diffstat
| M | jsonproxy/api.py | 111 | ++++++++++++++++++++++++++++++++++++------------------------- |
1 file changed, 65 insertions, 46 deletions
diff --git a/jsonproxy/api.py b/jsonproxy/api.py
@@ -1,8 +1,8 @@
 try:
-    from urllib.request import urlopen
+    from urllib.request import urlopen as _urlopen
     from urllib.error import HTTPError
 except ImportError:
-    from urllib2 import urlopen
+    from urllib2 import urlopen as _urlopen
     from urllib2 import HTTPError
 
 from flask import Blueprint
@@ -18,6 +18,23 @@ from bs4 import BeautifulSoup
 api = Blueprint('api', __name__, static_folder='static')
 
 
+def urlopen(url, parse=False):
+    try:
+        current_app.logger.info('fetching %s' % url)
+        original = _urlopen(url)
+
+        body = original.read()
+        code = original.getcode()
+        headers = original.headers.items()
+
+        if parse:
+            return BeautifulSoup(body)
+        else:
+            return body, code, headers
+    except HTTPError as error:
+        abort(error.code)
+
+
 def get_attribute_list(html, selector):
     if '@' in selector:
         s, attr = selector.rsplit('@', 1)
@@ -30,6 +47,51 @@ def get_attribute(html, selector):
     return get_attribute_list(html, selector)[0]
 
 
+def scrape_item(url, config):
+    tree = urlopen(url, parse=True)
+    data = {
+        'url': url
+    }
+    for key, selector in config['fields'].items():
+        data[key] = get_attribute(tree, selector)
+    return jsonify(data)
+
+
+def scrape_list(url, config):
+    tree = urlopen(url, parse=True)
+    return jsonify({
+        'url': url,
+        'l': get_attribute_list(tree, config['selector'])
+    })
+
+
+def proxy(url, config):
+    return make_response(*urlopen(url))
+
+
+@api.route('/<endpoint>/<path:path>', methods=['GET'])
+def main(endpoint, path):
+    try:
+        config = current_app.config['ENDPOINTS'][endpoint]
+    except KeyError:
+        abort(404)
+
+    url = request.url.replace(request.host_url + endpoint + '/', config['host'])
+    _type = config.get('type', 'proxy')
+
+    if _type == 'scrape_item':
+        response = scrape_item(url, config)
+    elif _type == 'scrape_list':
+        response = scrape_list(url, config)
+    else:
+        response = proxy(url, config)
+
+    if current_app.config.get('ALLOW_CORS', False):
+        response.headers['Access-Control-Allow-Origin'] = '*'
+
+    return response
+
+
 def _doc(endpoint):
     config = current_app.config['ENDPOINTS'][endpoint]
     url_doc = 'url of the scraped page'
@@ -58,7 +120,7 @@ def _doc(endpoint):
 
 
 @api.route('/', methods=['GET'])
-def main():
+def index():
     data = [_doc(endpoint) for endpoint in current_app.config['ENDPOINTS']]
     return render_template('index.html', endpoints=data)
 
@@ -68,46 +130,3 @@ def doc(endpoint):
     if endpoint not in current_app.config['ENDPOINTS']:
         abort(404)
     return render_template('index.html', endpoints=[_doc(endpoint)])
-
-
-@api.route('/<endpoint>/<path:path>', methods=['GET'])
-def proxy(endpoint, path):
-    try:
-        config = current_app.config['ENDPOINTS'][endpoint]
-    except KeyError:
-        abort(404)
-
-    try:
-        url = request.url.replace(request.host_url + endpoint + '/', config['host'])
-        current_app.logger.info('fetching %s' % url)
-        original = urlopen(url)
-    except HTTPError as error:
-        abort(error.code)
-
-    body = original.read()
-    code = original.getcode()
-    headers = original.headers.items()
-
-    type = config.get('type', 'proxy')
-
-    if type == 'scrape_item':
-        html = BeautifulSoup(body)
-        data = {
-            'url': url
-        }
-        for key, selector in config['fields'].items():
-            data[key] = get_attribute(html, selector)
-        response = jsonify(data)
-    elif type == 'scrape_list':
-        html = BeautifulSoup(body)
-        response = jsonify({
-            'url': url,
-            'l': get_attribute_list(html, config['selector'])
-        })
-    else:
-        response = make_response(body, code, headers)
-
-    if current_app.config.get('ALLOW_CORS', False):
-        response.headers['Access-Control-Allow-Origin'] = '*'
-
-    return response