PyJSONProxy

simple proxy and scraper
git clone https://git.ce9e.org/PyJSONProxy.git

commit
8e9785bc7037f1733abbd5ba82e378be20b59d68
parent
db5d4b2797b145c227988da36cfcdc351ae10ace
Author
Tobias Bengfort <tobias.bengfort@gmx.net>
Date
2015-12-06 22:07
reimplement based on asyncio

for performance and profit

Diffstat

M jsonproxy/__init__.py 106 ++++++++++++++++++++++++++++++++++---------------------------

1 files changed, 59 insertions, 47 deletions


diff --git a/jsonproxy/__init__.py b/jsonproxy/__init__.py

@@ -1,23 +1,16 @@
    1    -1 from __future__ import absolute_import
    2    -1 
   -1     1 import asyncio
    3     2 import os
    4     3 import sys
    5     4 
    6    -1 try:
    7    -1 	from urllib.request import urlopen as _urlopen
    8    -1 	from urllib.error import HTTPError
    9    -1 except ImportError:
   10    -1 	from urllib2 import urlopen as _urlopen
   11    -1 	from urllib2 import HTTPError
   12    -1 
   13    -1 from flask import abort
   14    -1 from flask import current_app
   15    -1 from flask import Flask
   16    -1 from flask import jsonify
   17    -1 from flask import make_response
   18    -1 from flask import render_template
   19    -1 from flask import request
   20    -1 import cachetools
   -1     5 import aiohttp
   -1     6 from aiohttp import web
   -1     7 from bs4 import BeautifulSoup
   -1     8 
   -1     9 from .web import Application
   -1    10 from .web import jsonify
   -1    11 from .web import render_template
   -1    12 from .web import make_response
   -1    13 from .web import abort
   21    14 
   22    15 from .lib import check_config
   23    16 from .lib import _doc
@@ -25,66 +18,85 @@ from .lib import ENDPOINTS
   25    18 from .lib import parse_args
   26    19 from .lib import scrape
   27    20 
   28    -1 app = Flask(__name__)
   -1    21 app = Application(__name__)
   29    22 
   30    23 
   31    24 def get_config(endpoint):
   32    25 	try:
   33    -1 		return current_app.config[ENDPOINTS][endpoint]
   -1    26 		return app.config[ENDPOINTS][endpoint]
   34    27 	except KeyError:
   35    28 		abort(404)
   36    29 
   37    30 
   38    -1 @cachetools.ttl_cache()
   39    -1 def urlopen(url):
   40    -1 	try:
   41    -1 		current_app.logger.info('fetching %s' % url)
   42    -1 		original = _urlopen(url)
   43    -1 
   44    -1 		body = original.read()
   45    -1 		code = original.getcode()
   46    -1 		headers = original.headers.items()
   -1    31 def async_cache(maxsize=128):
   -1    32 	cache = {}
   -1    33 
   -1    34 	def decorator(fn):
   -1    35 		def wrapper(*args):
   -1    36 			key = ':'.join(args)
   -1    37 			if key not in cache:
   -1    38 				if len(cache) >= maxsize:
   -1    39 					del cache[cache.keys().next()]
   -1    40 				cache[key] = yield from fn(*args)
   -1    41 			return cache[key]
   -1    42 		return wrapper
   -1    43 	return decorator
   -1    44 
   -1    45 
   -1    46 @async_cache()
   -1    47 def _request(method, url):
   -1    48 	app.logger.info(method, url)
   -1    49 	print(method, url)
   -1    50 	response = yield from aiohttp.request(method, url)
   -1    51 	if response.status != 200:
   -1    52 		abort(response.status)
   -1    53 	else:
   -1    54 		return response
   47    55 
   48    -1 		return body, code, headers
   49    -1 	except HTTPError as error:
   50    -1 		abort(error.code)
   51    56 
   -1    57 @app.route('/{endpoint}/{path:.+}', methods=['GET', 'HEAD', 'OPTIONS'])
   -1    58 @asyncio.coroutine
   -1    59 def handle(request):
   -1    60 	endpoint = request.match_info['endpoint']
   52    61 
   53    -1 @app.route('/<endpoint>/<path:path>', methods=['GET'])
   54    -1 def handle(endpoint, path):
   55    62 	config = get_config(endpoint)
   56    -1 	url = request.url.replace(request.host_url + endpoint + '/', config['host'])
   -1    63 	url = config['host'] + request.match_info['path']
   -1    64 	if request.query_string:
   -1    65 		url += '?' + request.query_string
   57    66 
   58    -1 	body, code, headers = urlopen(url)
   -1    67 	remote = yield from _request(request.method, url)
   -1    68 	body = yield from remote.read()
   59    69 
   60    -1 	if 'fields' in config:
   61    -1 		response = jsonify(scrape(url, body, config))
   -1    70 	if 'fields' in config and request.method == 'GET':
   -1    71 		response = jsonify(scrape(url, body, config), status=remote.status)
   62    72 	else:
   63    -1 		response = make_response(body, code)
   -1    73 		response = make_response(body, status=remote.status)
   64    74 
   65    -1 	if current_app.config.get('ALLOW_CORS', False):
   -1    75 	if app.config.get('ALLOW_CORS', False):
   66    76 		response.headers['Access-Control-Allow-Origin'] = '*'
   67    77 
   68    78 	return response
   69    79 
   70    80 
   71    -1 @app.route('/', methods=['GET'])
   72    -1 def index():
   73    -1 	config = current_app.config[ENDPOINTS]
   -1    81 @app.route('/')
   -1    82 def index(request):
   -1    83 	config = app.config[ENDPOINTS]
   74    84 	data = [_doc(config[endpoint], endpoint) for endpoint in config]
   75    85 	return render_template('index.html', endpoints=data)
   76    86 
   77    87 
   78    -1 @app.route('/<endpoint>/', methods=['GET'])
   79    -1 def doc(endpoint):
   80    -1 	config = get_config(endpoint)
   81    -1 	return render_template('index.html', endpoints=[_doc(config, endpoint)])
   -1    88 @app.route('/{endpoint}/')
   -1    89 def doc(request):
   -1    90 	endpoint = request.match_info['endpoint']
   -1    91 	config = app.get_config(endpoint)
   -1    92 	data = [_doc(config, endpoint)]
   -1    93 	return render_template('index.html', endpoints=data)
   82    94 
   83    95 
   84    96 def main():
   85    97 	args = parse_args()
   86    98 
   87    -1 	app.config.from_pyfile(os.path.abspath(args.config))
   -1    99 	app.config_from_file(os.path.abspath(args.config))
   88   100 	app.debug = args.debug
   89   101 
   90   102 	errors = check_config(app.config)