PyJSONProxy

simple proxy and scraper
git clone https://git.ce9e.org/PyJSONProxy.git

commit
e4ec571ed9fa190ae3487e4067d7e42a45f460e2
parent
ccb4d7342b3c6788159d90f0767c1af51d6dd2a2
Author
Tobias Bengfort <tobias.bengfort@gmx.net>
Date
2015-12-03 23:32
more flexible scrape config

Diffstat

M README.rst 49 ++++++++++++++++++++++++-------------------------
M example.cfg 20 +++++++++++++-------
M jsonproxy/__init__.py 22 ++++++++++++++++------
M jsonproxy/api.py 72 +++++++++++++++++++++++++++++++------------------------------
M jsonproxy/templates/index.html 30 ++++++++++++++----------------

5 files changed, 104 insertions, 89 deletions


diff --git a/README.rst b/README.rst

@@ -34,15 +34,13 @@ HTML pages, PyJSONProxy can extract information from there::
   34    34     {
   35    35       "url": "https://github.com/xi/",
   36    36       "login": "xi",
   37    -1       ...
   38    -1     }
   39    -1     $ curl http://localhost:5000/repos/xi/
   40    -1     {
   41    -1       "url": "https://github.com/xi/",
   42    -1       "l": [
   43    -1         "/xi/pyjsonproxy",
   -1    37       "activity": [
   -1    38         ...
   -1    39       ],
   -1    40       "repos": [{
   44    41         ...
   45    -1       ]
   -1    42       }]
   -1    43       ...
   46    44     }
   47    45 
   48    46 ::
@@ -50,26 +48,27 @@ HTML pages, PyJSONProxy can extract information from there::
   50    48     ENDPOINTS = {
   51    49         'github': {
   52    50             'host': 'https://github.com/',
   53    -1             'type': 'scrape_item',
   -1    51             'type': 'scrape',
   54    52             'fields': {
   55    -1               'login': '.vcard-username',
   56    -1               'fullname': '.vcard-fullname',
   57    -1               'email': '.vcard-details .email',
   58    -1               'join-date': '.vcard-details .join-date@datetime'
   -1    53                 'login': '.vcard-username',
   -1    54                 'fullname': '.vcard-fullname',
   -1    55                 'email': '.vcard-details .email',
   -1    56                 'join-date': '.vcard-details .join-date@datetime',
   -1    57                 'activity': {
   -1    58                     'selector': '.contribution-activity-listing ul a'
   -1    59                 },
   -1    60                 'repos': {
   -1    61                     'selector': '.popular-repos a.mini-repo-list-item',
   -1    62                     'fields': {
   -1    63                         'url': '@href',
   -1    64                         'name': '.repo',
   -1    65                         'description': '.repo-description'
   -1    66                     }
   -1    67                 }
   59    68             }
   60    -1         },
   61    -1         'repos': {
   62    -1             'host': 'https://github.com/',
   63    -1             'type': 'scrape_list',
   64    -1             'selector': '.popular-repos a.mini-repo-list-item@href'
   65    69         }
   66    70     }
   67    71 
   68    -1 There a two options here: ``scrape_item`` and ``scrape_list``. The first
   69    -1 one will take a list of fields and selectors and return only the first
   70    -1 match for each selector.The latter one will only take one selector and
   71    -1 return every match for this selector.
   72    -1 
   73    72 Selectors are generally CSS-selectors with the additional option to
   74    73 select an attribute by appending an ``@`` and the attribute name. If no
   75    74 attribute is selected, the text content of the element will be used.
@@ -86,7 +85,7 @@ all responses.
   86    85 Documentation
   87    86 =============
   88    87 
   89    -1 Some simple documentation is auomatically generated and available under
   -1    88 Some simple documentation is automatically generated and available under
   90    89 ``/`` (for all endpoints) or ``/{endpoint}/`` (for an individual
   91    90 endpoint). To provide some input for this documentation, you can add a
   92    91 description to both endpoints and fields::
@@ -94,7 +93,7 @@ description to both endpoints and fields::
   94    93     ENDPOINTS = {
   95    94         'github': {
   96    95             'host': 'https://github.com/',
   97    -1             'type': 'scrape_item',
   -1    96             'type': 'scrape',
   98    97             'doc': 'Access data about GitHub users',
   99    98             'fields': {
  100    99               'login': '.vcard-username',

diff --git a/example.cfg b/example.cfg

@@ -4,23 +4,29 @@ ENDPOINTS = {
    4     4 	},
    5     5 	'user': {
    6     6 		'host': 'https://github.com/',
    7    -1 		'type': 'scrape_item',
   -1     7 		'type': 'scrape',
    8     8 		'fields': {
    9     9 			'login': '.vcard-username',
   10    10 			'fullname': '.vcard-fullname',
   11    11 			'email': '.vcard-details .email',
   12    -1 			'join-date': '.vcard-details .join-date@datetime'
   -1    12 			'join-date': '.vcard-details .join-date@datetime',
   -1    13 			'activity': {
   -1    14 				'selector': '.contribution-activity-listing ul a'
   -1    15 			},
   -1    16 			'repos': {
   -1    17 				'selector': '.popular-repos a.mini-repo-list-item',
   -1    18 				'fields': {
   -1    19 					'url': '@href',
   -1    20 					'name': '.repo',
   -1    21 					'description': '.repo-description',
   -1    22 				}
   -1    23 			},
   13    24 		},
   14    25 		'fields_doc': {
   15    26 			'login': 'github username',
   16    27 			'fullname': 'the user\'s full name',
   17    28 			'join-date': 'date when the user joined github in ISO-xx format'
   18    29 		}
   19    -1 	},
   20    -1 	'repos': {
   21    -1 		'host': 'https://github.com/',
   22    -1 		'type': 'scrape_list',
   23    -1 		'selector': '.popular-repos a.mini-repo-list-item@href'
   24    30 	}
   25    31 }
   26    32 

diff --git a/jsonproxy/__init__.py b/jsonproxy/__init__.py

@@ -8,10 +8,22 @@ from flask import Flask
    8     8 
    9     9 from .api import api
   10    10 
   11    -1 TYPES = ['proxy', 'scrape_item', 'scrape_list']
   -1    11 TYPES = ['proxy', 'scrape']
   12    12 ENDPOINTS = 'ENDPOINTS'
   13    13 
   14    14 
   -1    15 def check_fields_config(fields, endpoint, field=''):
   -1    16 	for key, value in fields.items():
   -1    17 		full_key = field + '.' + key if field else key
   -1    18 		if isinstance(value, dict):
   -1    19 			if 'selector' not in value:
   -1    20 				yield ('No selector configured for field %s in endpoint %s.' %
   -1    21 					(full_key, endpoint))
   -1    22 			if 'fields' in value:
   -1    23 				for error in check_fields_config(value['fields'], endpoint, full_key):
   -1    24 					yield error
   -1    25 
   -1    26 
   15    27 def check_config(config):
   16    28 	errors = []
   17    29 
@@ -23,14 +35,12 @@ def check_config(config):
   23    35 			if _type not in TYPES:
   24    36 				errors.append('Unknown endpoint type %s for endpoint %s. '
   25    37 					'Choose one of %s.' % (_type, key, ', '.join(TYPES)))
   26    -1 			elif _type == 'scrape_item':
   -1    38 			elif _type == 'scrape':
   27    39 				if 'fields' not in data or len(data['fields']) == 0:
   28    40 					errors.append('No fields configured for endpoint %s of type %s.' %
   29    41 						(key, _type))
   30    -1 			elif _type == 'scrape_list':
   31    -1 				if 'selector' not in data:
   32    -1 					errors.append('Endpoint %s of type %s is missing a selector.' %
   33    -1 						(key, _type))
   -1    42 				else:
   -1    43 					errors += list(check_fields_config(data['fields'], key))
   34    44 
   35    45 	return errors
   36    46 

diff --git a/jsonproxy/api.py b/jsonproxy/api.py

@@ -37,11 +37,17 @@ def urlopen(url, parse=False):
   37    37 
   38    38 
   39    39 def get_attribute_list(html, selector):
   -1    40 	s = selector.rsplit('@', 1)[0]
   -1    41 	if s:
   -1    42 		elements = html.select(s)
   -1    43 	else:
   -1    44 		elements = [html]
   -1    45 
   40    46 	if '@' in selector:
   41    -1 		s, attr = selector.rsplit('@', 1)
   42    -1 		return [element[attr] for element in html.select(s)]
   -1    47 		attr = selector.rsplit('@', 1)[1]
   -1    48 		return [element[attr] for element in elements]
   43    49 	else:
   44    -1 		return [element.text.strip() for element in html.select(selector)]
   -1    50 		return [element.text.strip() for element in elements]
   45    51 
   46    52 
   47    53 def get_attribute(html, selector):
@@ -50,22 +56,24 @@ def get_attribute(html, selector):
   50    56 		return l[0]
   51    57 
   52    58 
   53    -1 def scrape_item(url, config):
   54    -1 	tree = urlopen(url, parse=True)
   55    -1 	data = {
   56    -1 		'url': url
   57    -1 	}
   58    -1 	for key, selector in config['fields'].items():
   59    -1 		data[key] = get_attribute(tree, selector)
   60    -1 	return jsonify(data)
   -1    59 def get_fields(html, config):
   -1    60 	data = {}
   -1    61 	for key, value in config['fields'].items():
   -1    62 		if isinstance(value, str):
   -1    63 			data[key] = get_attribute(html, value)
   -1    64 		elif 'fields' in value:
   -1    65 			elements = html.select(value['selector'])
   -1    66 			data[key] = [get_fields(e, value) for e in elements]
   -1    67 		else:
   -1    68 			data[key] = get_attribute_list(html, value['selector'])
   -1    69 	return data
   61    70 
   62    71 
   63    -1 def scrape_list(url, config):
   64    -1 	tree = urlopen(url, parse=True)
   65    -1 	return jsonify({
   66    -1 		'url': url,
   67    -1 		'l': get_attribute_list(tree, config['selector'])
   68    -1 	})
   -1    72 def scrape(url, config):
   -1    73 	html = urlopen(url, parse=True)
   -1    74 	data = get_fields(html, config)
   -1    75 	data['url'] = url
   -1    76 	return jsonify(data)
   69    77 
   70    78 
   71    79 def proxy(url, config):
@@ -82,10 +90,8 @@ def main(endpoint, path):
   82    90 	url = request.url.replace(request.host_url + endpoint + '/', config['host'])
   83    91 	_type = config.get('type', 'proxy')
   84    92 
   85    -1 	if _type == 'scrape_item':
   86    -1 		response = scrape_item(url, config)
   87    -1 	elif _type == 'scrape_list':
   88    -1 		response = scrape_list(url, config)
   -1    93 	if _type == 'scrape':
   -1    94 		response = scrape(url, config)
   89    95 	else:
   90    96 		response = proxy(url, config)
   91    97 
@@ -95,29 +101,25 @@ def main(endpoint, path):
   95   101 	return response
   96   102 
   97   103 
   -1   104 def _fields_doc(config):
   -1   105 	if isinstance(config, dict):
   -1   106 		fields = config.get('fields', {})
   -1   107 		doc = config.get('fields_doc', {})
   -1   108 		for key in fields:
   -1   109 			yield key, doc.get(key, ''), list(_fields_doc(fields[key]))
   -1   110 
   -1   111 
   98   112 def _doc(endpoint):
   99   113 	config = current_app.config['ENDPOINTS'][endpoint]
  100    -1 	url_doc = 'url of the scraped page'
  101   114 
  102   115 	data = {
  103   116 		'title': endpoint,
  104   117 		'doc': config.get('doc', ''),
  105   118 		'type': config.get('type', 'proxy'),
  106    -1 		'fields': [],
   -1   119 		'fields': list(_fields_doc(config)),
  107   120 	}
  108   121 
  109    -1 	if data['type'] == 'scrape_item':
  110    -1 		fields_doc = config.get('fields_doc', {})
  111    -1 		data['fields'].append(('url', url_doc))
  112    -1 		for key in config['fields']:
  113    -1 			doc = fields_doc.get(key, '')
  114    -1 			data['fields'].append((key, doc))
  115    -1 
  116    -1 	if data['type'] == 'scrape_list':
  117    -1 		data['fields'] = [
  118    -1 			('url', url_doc),
  119    -1 			('l', 'list of results'),
  120    -1 		]
   -1   122 	data['fields'].append(('url', 'url of the scraped page', []))
  121   123 
  122   124 	return data
  123   125 

diff --git a/jsonproxy/templates/index.html b/jsonproxy/templates/index.html

@@ -10,22 +10,20 @@
   10    10 			<h2>{{ endpoint.title }} ({{ endpoint.type }})</h2>
   11    11 			<p>{{ endpoint.doc }}</p>
   12    12 			{% if endpoint.fields %}
   13    -1 			<table>
   14    -1 				<thead>
   15    -1 					<tr>
   16    -1 						<th>name</th>
   17    -1 						<th>description</th>
   18    -1 					</tr>
   19    -1 				</thead>
   20    -1 				<tbody>
   21    -1 					{% for name, doc in endpoint.fields %}
   22    -1 					<tr>
   23    -1 						<td>{{ name }}</td>
   24    -1 						<td>{{ doc }}</td>
   25    -1 					</tr>
   26    -1 					{% endfor %}
   27    -1 				</tbody>
   28    -1 			</table>
   -1    13 			<dl>
   -1    14 				{% for name, doc, fields in endpoint.fields recursive %}
   -1    15 				<dt>{{ name }}</dt>
   -1    16 				<dd>
   -1    17 					{{ doc }}
   -1    18 
   -1    19 					{% if fields %}
   -1    20 					<dl>
   -1    21 						{{ loop(fields) }}
   -1    22 					</dl>
   -1    23 					{% endif %}
   -1    24 				</dd>
   -1    25 				{% endfor %}
   -1    26 			</dl>
   29    27 			{% endif %}
   30    28 		</section>
   31    29 		{% endfor %}