PyJSONProxy

simple proxy and scraper
git clone https://git.ce9e.org/PyJSONProxy.git

commit
901b5090d4e4d995221d08e95aed7342fc20c89f
parent
75dfc6834307e3989467112711318ba07ad9b0af
Author
Tobias Bengfort <tobias.bengfort@gmx.net>
Date
2015-02-05 17:58
Gardening

Diffstat

M jsonproxy/api.py 111 ++++++++++++++++++++++++++++++++++++-------------------------

1 file changed, 65 insertions, 46 deletions


diff --git a/jsonproxy/api.py b/jsonproxy/api.py

@@ -1,8 +1,8 @@
    1     1 try:
    2    -1 	from urllib.request import urlopen
   -1     2 	from urllib.request import urlopen as _urlopen
    3     3 	from urllib.error import HTTPError
    4     4 except ImportError:
    5    -1 	from urllib2 import urlopen
   -1     5 	from urllib2 import urlopen as _urlopen
    6     6 	from urllib2 import HTTPError
    7     7 
    8     8 from flask import Blueprint
@@ -18,6 +18,23 @@ from bs4 import BeautifulSoup
   18    18 api = Blueprint('api', __name__, static_folder='static')
   19    19 
   20    20 
   -1    21 def urlopen(url, parse=False):
   -1    22 	try:
   -1    23 		current_app.logger.info('fetching %s' % url)
   -1    24 		original = _urlopen(url)
   -1    25 
   -1    26 		body = original.read()
   -1    27 		code = original.getcode()
   -1    28 		headers = original.headers.items()
   -1    29 
   -1    30 		if parse:
   -1    31 			return BeautifulSoup(body)
   -1    32 		else:
   -1    33 			return body, code, headers
   -1    34 	except HTTPError as error:
   -1    35 		abort(error.code)
   -1    36 
   -1    37 
   21    38 def get_attribute_list(html, selector):
   22    39 	if '@' in selector:
   23    40 		s, attr = selector.rsplit('@', 1)
@@ -30,6 +47,51 @@ def get_attribute(html, selector):
   30    47 	return get_attribute_list(html, selector)[0]
   31    48 
   32    49 
   -1    50 def scrape_item(url, config):
   -1    51 	tree = urlopen(url, parse=True)
   -1    52 	data = {
   -1    53 		'url': url
   -1    54 	}
   -1    55 	for key, selector in config['fields'].items():
   -1    56 		data[key] = get_attribute(tree, selector)
   -1    57 	return jsonify(data)
   -1    58 
   -1    59 
   -1    60 def scrape_list(url, config):
   -1    61 	tree = urlopen(url, parse=True)
   -1    62 	return jsonify({
   -1    63 		'url': url,
   -1    64 		'l': get_attribute_list(tree, config['selector'])
   -1    65 	})
   -1    66 
   -1    67 
   -1    68 def proxy(url, config):
   -1    69 	return make_response(*urlopen(url))
   -1    70 
   -1    71 
   -1    72 @api.route('/<endpoint>/<path:path>', methods=['GET'])
   -1    73 def main(endpoint, path):
   -1    74 	try:
   -1    75 		config = current_app.config['ENDPOINTS'][endpoint]
   -1    76 	except KeyError:
   -1    77 		abort(404)
   -1    78 
   -1    79 	url = request.url.replace(request.host_url + endpoint + '/', config['host'])
   -1    80 	_type = config.get('type', 'proxy')
   -1    81 
   -1    82 	if _type == 'scrape_item':
   -1    83 		response = scrape_item(url, config)
   -1    84 	elif _type == 'scrape_list':
   -1    85 		response = scrape_list(url, config)
   -1    86 	else:
   -1    87 		response = proxy(url, config)
   -1    88 
   -1    89 	if current_app.config.get('ALLOW_CORS', False):
   -1    90 		response.headers['Access-Control-Allow-Origin'] = '*'
   -1    91 
   -1    92 	return response
   -1    93 
   -1    94 
   33    95 def _doc(endpoint):
   34    96 	config = current_app.config['ENDPOINTS'][endpoint]
   35    97 	url_doc = 'url of the scraped page'
@@ -58,7 +120,7 @@ def _doc(endpoint):
   58   120 
   59   121 
   60   122 @api.route('/', methods=['GET'])
   61    -1 def main():
   -1   123 def index():
   62   124 	data = [_doc(endpoint) for endpoint in current_app.config['ENDPOINTS']]
   63   125 	return render_template('index.html', endpoints=data)
   64   126 
@@ -68,46 +130,3 @@ def doc(endpoint):
   68   130 	if endpoint not in current_app.config['ENDPOINTS']:
   69   131 		abort(404)
   70   132 	return render_template('index.html', endpoints=[_doc(endpoint)])
   71    -1 
   72    -1 
   73    -1 @api.route('/<endpoint>/<path:path>', methods=['GET'])
   74    -1 def proxy(endpoint, path):
   75    -1 	try:
   76    -1 		config = current_app.config['ENDPOINTS'][endpoint]
   77    -1 	except KeyError:
   78    -1 		abort(404)
   79    -1 
   80    -1 	try:
   81    -1 		url = request.url.replace(request.host_url + endpoint + '/', config['host'])
   82    -1 		current_app.logger.info('fetching %s' % url)
   83    -1 		original = urlopen(url)
   84    -1 	except HTTPError as error:
   85    -1 		abort(error.code)
   86    -1 
   87    -1 	body = original.read()
   88    -1 	code = original.getcode()
   89    -1 	headers = original.headers.items()
   90    -1 
   91    -1 	type = config.get('type', 'proxy')
   92    -1 
   93    -1 	if type == 'scrape_item':
   94    -1 		html = BeautifulSoup(body)
   95    -1 		data = {
   96    -1 			'url': url
   97    -1 		}
   98    -1 		for key, selector in config['fields'].items():
   99    -1 			data[key] = get_attribute(html, selector)
  100    -1 		response = jsonify(data)
  101    -1 	elif type == 'scrape_list':
  102    -1 		html = BeautifulSoup(body)
  103    -1 		response = jsonify({
  104    -1 			'url': url,
  105    -1 			'l': get_attribute_list(html, config['selector'])
  106    -1 		})
  107    -1 	else:
  108    -1 		response = make_response(body, code, headers)
  109    -1 
  110    -1 	if current_app.config.get('ALLOW_CORS', False):
  111    -1 		response.headers['Access-Control-Allow-Origin'] = '*'
  112    -1 
  113    -1 	return response