PyJSONProxy

simple proxy and scraper
git clone https://git.ce9e.org/PyJSONProxy.git

commit
bd0cbefebd9d191c24bd1e0bd065aade6c66d7b5
parent
7f46223aba8eec112efcd3cd5937a07e01d1af3f
Author
Tobias Bengfort <tobias.bengfort@gmx.net>
Date
2015-12-06 21:20
mv more scraping code to lib

Diffstat

M jsonproxy/api.py 32 ++++++++++----------------------
M jsonproxy/lib.py 23 +++++++++++++++++++++++

2 files changed, 33 insertions, 22 deletions


diff --git a/jsonproxy/api.py b/jsonproxy/api.py

@@ -14,12 +14,11 @@ from flask import abort
   14    14 from flask import jsonify
   15    15 from flask import make_response
   16    16 from flask import render_template
   17    -1 from bs4 import BeautifulSoup
   18    17 import cachetools
   19    18 
   20    -1 from .lib import get_fields
   21    19 from .lib import _doc
   22    20 from .lib import ENDPOINTS
   -1    21 from .lib import scrape
   23    22 
   24    23 api = Blueprint('api', __name__, static_folder='static')
   25    24 
@@ -32,7 +31,7 @@ def get_config(endpoint):
   32    31 
   33    32 
   34    33 @cachetools.ttl_cache()
   35    -1 def urlopen(url, parse=False):
   -1    34 def urlopen(url):
   36    35 	try:
   37    36 		current_app.logger.info('fetching %s' % url)
   38    37 		original = _urlopen(url)
@@ -41,36 +40,25 @@ def urlopen(url, parse=False):
   41    40 		code = original.getcode()
   42    41 		headers = original.headers.items()
   43    42 
   44    -1 		if parse:
   45    -1 			return BeautifulSoup(body)
   46    -1 		else:
   47    -1 			return body, code, headers
   -1    43 		return body, code, headers
   48    44 	except HTTPError as error:
   49    45 		abort(error.code)
   50    46 
   51    47 
   52    -1 def scrape(url, config):
   53    -1 	html = urlopen(url, parse=True)
   54    -1 	data = get_fields(html, config)
   55    -1 	data['url'] = url
   56    -1 	if 'post' in config:
   57    -1 		data = config['post'](data)
   58    -1 	return jsonify(data)
   59    -1 
   60    -1 
   61    -1 def proxy(url, config):
   62    -1 	return make_response(*urlopen(url))
   63    -1 
   64    -1 
   65    48 @api.route('/<endpoint>/<path:path>', methods=['GET'])
   66    49 def main(endpoint, path):
   67    50 	config = get_config(endpoint)
   68    51 	url = request.url.replace(request.host_url + endpoint + '/', config['host'])
   69    52 
   -1    53 	body, code, headers = urlopen(url)
   -1    54 
   70    55 	if 'fields' in config:
   71    -1 		response = scrape(url, config)
   -1    56 		if code == 200:
   -1    57 			response = jsonify(scrape(url, body, config))
   -1    58 		else:
   -1    59 			abort(code)
   72    60 	else:
   73    -1 		response = proxy(url, config)
   -1    61 		response = make_response(body, code)
   74    62 
   75    63 	if current_app.config.get('ALLOW_CORS', False):
   76    64 		response.headers['Access-Control-Allow-Origin'] = '*'

diff --git a/jsonproxy/lib.py b/jsonproxy/lib.py

@@ -1,3 +1,12 @@
   -1     1 import argparse
   -1     2 
   -1     3 from bs4 import BeautifulSoup
   -1     4 
   -1     5 try:
   -1     6 	from functools import lfu_cache
   -1     7 except ImportError:
   -1     8 	from cachetools import lfu_cache
   -1     9 
    1    10 ENDPOINTS = 'ENDPOINTS'
    2    11 
    3    12 
@@ -34,6 +43,20 @@ def get_fields(html, config):
   34    43 	return data
   35    44 
   36    45 
   -1    46 @lfu_cache()
   -1    47 def parse_html(body):
   -1    48 	return BeautifulSoup(body)
   -1    49 
   -1    50 
   -1    51 def scrape(url, body, config):
   -1    52 	html = parse_html(body)
   -1    53 	data = get_fields(html, config)
   -1    54 	data['url'] = url
   -1    55 	if 'post' in config:
   -1    56 		data = config['post'](data)
   -1    57 	return data
   -1    58 
   -1    59 
   37    60 def _fields_doc(config):
   38    61 	if isinstance(config, dict):
   39    62 		fields = config.get('fields', {})