PyJSONProxy

simple proxy and scraper
git clone https://git.ce9e.org/PyJSONProxy.git

commit
7fa385428e8a21c7b4efc643ffcf5b6475ad7b6f
parent
9dc70f62554d44f059ec9f3587c232a844d94991
Author
Tobias Bengfort <tobias.bengfort@gmx.net>
Date
2015-12-06 07:50
Move reusable logic to a separate module

Diffstat

M jsonproxy/__init__.py 31 +------------------------------
M jsonproxy/api.py 82 ++++++++++++++-----------------------------------------------
A jsonproxy/lib.py 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 101 insertions, 94 deletions


diff --git a/jsonproxy/__init__.py b/jsonproxy/__init__.py

@@ -7,36 +7,7 @@ import argparse
    7     7 from flask import Flask
    8     8 
    9     9 from .api import api
   10    -1 
   11    -1 ENDPOINTS = 'ENDPOINTS'
   12    -1 
   13    -1 
   14    -1 def check_fields_config(fields, endpoint, field=''):
   15    -1 	for key, value in fields.items():
   16    -1 		full_key = field + '.' + key if field else key
   17    -1 		if isinstance(value, dict):
   18    -1 			if 'selector' not in value:
   19    -1 				yield ('No selector configured for field %s in endpoint %s.' %
   20    -1 					(full_key, endpoint))
   21    -1 			if 'fields' in value:
   22    -1 				for error in check_fields_config(value['fields'], endpoint, full_key):
   23    -1 					yield error
   24    -1 
   25    -1 
   26    -1 def check_config(config):
   27    -1 	errors = []
   28    -1 
   29    -1 	if ENDPOINTS not in config or len(config[ENDPOINTS]) == 0:
   30    -1 		errors.append('No endpoints configured.')
   31    -1 	else:
   32    -1 		for key, data in config[ENDPOINTS].items():
   33    -1 			if 'fields' in data:
   34    -1 				if len(data['fields']) == 0:
   35    -1 					errors.append('No fields configured for endpoint %s.' % key)
   36    -1 				else:
   37    -1 					errors += list(check_fields_config(data['fields'], key))
   38    -1 
   39    -1 	return errors
   -1    10 from .lib import check_config
   40    11 
   41    12 
   42    13 def main():

diff --git a/jsonproxy/api.py b/jsonproxy/api.py

@@ -1,3 +1,5 @@
   -1     1 from __future__ import absolute_import
   -1     2 
    1     3 try:
    2     4 	from urllib.request import urlopen as _urlopen
    3     5 	from urllib.error import HTTPError
@@ -15,9 +17,20 @@ from flask import render_template
   15    17 from bs4 import BeautifulSoup
   16    18 import cachetools
   17    19 
   -1    20 from .lib import get_fields
   -1    21 from .lib import _doc
   -1    22 from .lib import ENDPOINTS
   -1    23 
   18    24 api = Blueprint('api', __name__, static_folder='static')
   19    25 
   20    26 
   -1    27 def get_config(endpoint):
   -1    28 	try:
   -1    29 		return current_app.config[ENDPOINTS][endpoint]
   -1    30 	except KeyError:
   -1    31 		abort(404)
   -1    32 
   -1    33 
   21    34 @cachetools.ttl_cache()
   22    35 def urlopen(url, parse=False):
   23    36 	try:
@@ -36,39 +49,6 @@ def urlopen(url, parse=False):
   36    49 		abort(error.code)
   37    50 
   38    51 
   39    -1 def get_attribute_list(html, selector):
   40    -1 	s = selector.rsplit('@', 1)[0]
   41    -1 	if s:
   42    -1 		elements = html.select(s)
   43    -1 	else:
   44    -1 		elements = [html]
   45    -1 
   46    -1 	if '@' in selector:
   47    -1 		attr = selector.rsplit('@', 1)[1]
   48    -1 		return [element[attr] for element in elements]
   49    -1 	else:
   50    -1 		return [element.text.strip() for element in elements]
   51    -1 
   52    -1 
   53    -1 def get_attribute(html, selector):
   54    -1 	l = get_attribute_list(html, selector)
   55    -1 	if len(l) > 0:
   56    -1 		return l[0]
   57    -1 
   58    -1 
   59    -1 def get_fields(html, config):
   60    -1 	data = {}
   61    -1 	for key, value in config['fields'].items():
   62    -1 		if isinstance(value, str):
   63    -1 			data[key] = get_attribute(html, value)
   64    -1 		elif 'fields' in value:
   65    -1 			elements = html.select(value['selector'])
   66    -1 			data[key] = [get_fields(e, value) for e in elements]
   67    -1 		else:
   68    -1 			data[key] = get_attribute_list(html, value['selector'])
   69    -1 	return data
   70    -1 
   71    -1 
   72    52 def scrape(url, config):
   73    53 	html = urlopen(url, parse=True)
   74    54 	data = get_fields(html, config)
@@ -84,11 +64,7 @@ def proxy(url, config):
   84    64 
   85    65 @api.route('/<endpoint>/<path:path>', methods=['GET'])
   86    66 def main(endpoint, path):
   87    -1 	try:
   88    -1 		config = current_app.config['ENDPOINTS'][endpoint]
   89    -1 	except KeyError:
   90    -1 		abort(404)
   91    -1 
   -1    67 	config = get_config(endpoint)
   92    68 	url = request.url.replace(request.host_url + endpoint + '/', config['host'])
   93    69 
   94    70 	if 'fields' in config:
@@ -102,36 +78,14 @@ def main(endpoint, path):
  102    78 	return response
  103    79 
  104    80 
  105    -1 def _fields_doc(config):
  106    -1 	if isinstance(config, dict):
  107    -1 		fields = config.get('fields', {})
  108    -1 		doc = config.get('fields_doc', {})
  109    -1 		for key in fields:
  110    -1 			yield key, doc.get(key, ''), list(_fields_doc(fields[key]))
  111    -1 
  112    -1 
  113    -1 def _doc(endpoint):
  114    -1 	config = current_app.config['ENDPOINTS'][endpoint]
  115    -1 
  116    -1 	data = {
  117    -1 		'title': endpoint,
  118    -1 		'doc': config.get('doc', ''),
  119    -1 		'fields': list(_fields_doc(config)),
  120    -1 	}
  121    -1 
  122    -1 	data['fields'].append(('url', 'url of the scraped page', []))
  123    -1 
  124    -1 	return data
  125    -1 
  126    -1 
  127    81 @api.route('/', methods=['GET'])
  128    82 def index():
  129    -1 	data = [_doc(endpoint) for endpoint in current_app.config['ENDPOINTS']]
   -1    83 	config = current_app.config[ENDPOINTS]
   -1    84 	data = [_doc(config[endpoint], endpoint) for endpoint in config]
  130    85 	return render_template('index.html', endpoints=data)
  131    86 
  132    87 
  133    88 @api.route('/<endpoint>/', methods=['GET'])
  134    89 def doc(endpoint):
  135    -1 	if endpoint not in current_app.config['ENDPOINTS']:
  136    -1 		abort(404)
  137    -1 	return render_template('index.html', endpoints=[_doc(endpoint)])
   -1    90 	config = get_config(endpoint)
   -1    91 	return render_template('index.html', endpoints=[_doc(config, endpoint)])

diff --git a/jsonproxy/lib.py b/jsonproxy/lib.py

@@ -0,0 +1,82 @@
   -1     1 ENDPOINTS = 'ENDPOINTS'
   -1     2 
   -1     3 
   -1     4 def get_attribute_list(html, selector):
   -1     5 	s = selector.rsplit('@', 1)[0]
   -1     6 	if s:
   -1     7 		elements = html.select(s)
   -1     8 	else:
   -1     9 		elements = [html]
   -1    10 
   -1    11 	if '@' in selector:
   -1    12 		attr = selector.rsplit('@', 1)[1]
   -1    13 		return [element[attr] for element in elements]
   -1    14 	else:
   -1    15 		return [element.text.strip() for element in elements]
   -1    16 
   -1    17 
   -1    18 def get_attribute(html, selector):
   -1    19 	l = get_attribute_list(html, selector)
   -1    20 	if len(l) > 0:
   -1    21 		return l[0]
   -1    22 
   -1    23 
   -1    24 def get_fields(html, config):
   -1    25 	data = {}
   -1    26 	for key, value in config['fields'].items():
   -1    27 		if isinstance(value, str):
   -1    28 			data[key] = get_attribute(html, value)
   -1    29 		elif 'fields' in value:
   -1    30 			elements = html.select(value['selector'])
   -1    31 			data[key] = [get_fields(e, value) for e in elements]
   -1    32 		else:
   -1    33 			data[key] = get_attribute_list(html, value['selector'])
   -1    34 	return data
   -1    35 
   -1    36 
   -1    37 def _fields_doc(config):
   -1    38 	if isinstance(config, dict):
   -1    39 		fields = config.get('fields', {})
   -1    40 		doc = config.get('fields_doc', {})
   -1    41 		for key in fields:
   -1    42 			yield key, doc.get(key, ''), list(_fields_doc(fields[key]))
   -1    43 
   -1    44 
   -1    45 def _doc(config, endpoint):
   -1    46 	data = {
   -1    47 		'title': endpoint,
   -1    48 		'doc': config.get('doc', ''),
   -1    49 		'fields': list(_fields_doc(config)),
   -1    50 	}
   -1    51 
   -1    52 	data['fields'].append(('url', 'url of the scraped page', []))
   -1    53 
   -1    54 	return data
   -1    55 
   -1    56 
   -1    57 def check_fields_config(fields, endpoint, field=''):
   -1    58 	for key, value in fields.items():
   -1    59 		full_key = field + '.' + key if field else key
   -1    60 		if isinstance(value, dict):
   -1    61 			if 'selector' not in value:
   -1    62 				yield ('No selector configured for field %s in endpoint %s.' %
   -1    63 					(full_key, endpoint))
   -1    64 			if 'fields' in value:
   -1    65 				for error in check_fields_config(value['fields'], endpoint, full_key):
   -1    66 					yield error
   -1    67 
   -1    68 
   -1    69 def check_config(config):
   -1    70 	errors = []
   -1    71 
   -1    72 	if ENDPOINTS not in config or len(config[ENDPOINTS]) == 0:
   -1    73 		errors.append('No endpoints configured.')
   -1    74 	else:
   -1    75 		for key, data in config[ENDPOINTS].items():
   -1    76 			if 'fields' in data:
   -1    77 				if len(data['fields']) == 0:
   -1    78 					errors.append('No fields configured for endpoint %s.' % key)
   -1    79 				else:
   -1    80 					errors += list(check_fields_config(data['fields'], key))
   -1    81 
   -1    82 	return errors