- commit
- e4ec571ed9fa190ae3487e4067d7e42a45f460e2
- parent
- ccb4d7342b3c6788159d90f0767c1af51d6dd2a2
- Author
- Tobias Bengfort <tobias.bengfort@gmx.net>
- Date
- 2015-12-03 23:32
more flexible scrape config
Diffstat
| M | README.rst | 49 | ++++++++++++++++++++++++------------------------- |
| M | example.cfg | 20 | +++++++++++++------- |
| M | jsonproxy/__init__.py | 22 | ++++++++++++++++------ |
| M | jsonproxy/api.py | 72 | +++++++++++++++++++++++++++++++------------------------------ |
| M | jsonproxy/templates/index.html | 30 | ++++++++++++++---------------- |
5 files changed, 104 insertions, 89 deletions
diff --git a/README.rst b/README.rst
@@ -34,15 +34,13 @@ HTML pages, PyJSONProxy can extract information from there::
34 34 {
35 35 "url": "https://github.com/xi/",
36 36 "login": "xi",
37 -1 ...
38 -1 }
39 -1 $ curl http://localhost:5000/repos/xi/
40 -1 {
41 -1 "url": "https://github.com/xi/",
42 -1 "l": [
43 -1 "/xi/pyjsonproxy",
-1 37 "activity": [
-1 38 ...
-1 39 ],
-1 40 "repos": [{
44 41 ...
45 -1 ]
-1 42 }]
-1 43 ...
46 44 }
47 45
48 46 ::
@@ -50,26 +48,27 @@ HTML pages, PyJSONProxy can extract information from there::
50 48 ENDPOINTS = {
51 49 'github': {
52 50 'host': 'https://github.com/',
53 -1 'type': 'scrape_item',
-1 51 'type': 'scrape',
54 52 'fields': {
55 -1 'login': '.vcard-username',
56 -1 'fullname': '.vcard-fullname',
57 -1 'email': '.vcard-details .email',
58 -1 'join-date': '.vcard-details .join-date@datetime'
-1 53 'login': '.vcard-username',
-1 54 'fullname': '.vcard-fullname',
-1 55 'email': '.vcard-details .email',
-1 56 'join-date': '.vcard-details .join-date@datetime',
-1 57 'activity': {
-1 58 'selector': '.contribution-activity-listing ul a'
-1 59 },
-1 60 'repos': {
-1 61 'selector': '.popular-repos a.mini-repo-list-item',
-1 62 'fields': {
-1 63 'url': '@href',
-1 64 'name': '.repo',
-1 65 'description': '.repo-description'
-1 66 }
-1 67 }
59 68 }
60 -1 },
61 -1 'repos': {
62 -1 'host': 'https://github.com/',
63 -1 'type': 'scrape_list',
64 -1 'selector': '.popular-repos a.mini-repo-list-item@href'
65 69 }
66 70 }
67 71
68 -1 There a two options here: ``scrape_item`` and ``scrape_list``. The first
69 -1 one will take a list of fields and selectors and return only the first
70 -1 match for each selector.The latter one will only take one selector and
71 -1 return every match for this selector.
72 -1
73 72 Selectors are generally CSS-selectors with the additional option to
74 73 select an attribute by appending an ``@`` and the attribute name. If no
75 74 attribute is selected, the text content of the element will be used.
@@ -86,7 +85,7 @@ all responses.
86 85 Documentation
87 86 =============
88 87
89 -1 Some simple documentation is auomatically generated and available under
-1 88 Some simple documentation is automatically generated and available under
90 89 ``/`` (for all endpoints) or ``/{endpoint}/`` (for an individual
91 90 endpoint). To provide some input for this documentation, you can add a
92 91 description to both endpoints and fields::
@@ -94,7 +93,7 @@ description to both endpoints and fields::
94 93 ENDPOINTS = {
95 94 'github': {
96 95 'host': 'https://github.com/',
97 -1 'type': 'scrape_item',
-1 96 'type': 'scrape',
98 97 'doc': 'Access data about GitHub users',
99 98 'fields': {
100 99 'login': '.vcard-username',
diff --git a/example.cfg b/example.cfg
@@ -4,23 +4,29 @@ ENDPOINTS = {
4 4 },
5 5 'user': {
6 6 'host': 'https://github.com/',
7 -1 'type': 'scrape_item',
-1 7 'type': 'scrape',
8 8 'fields': {
9 9 'login': '.vcard-username',
10 10 'fullname': '.vcard-fullname',
11 11 'email': '.vcard-details .email',
12 -1 'join-date': '.vcard-details .join-date@datetime'
-1 12 'join-date': '.vcard-details .join-date@datetime',
-1 13 'activity': {
-1 14 'selector': '.contribution-activity-listing ul a'
-1 15 },
-1 16 'repos': {
-1 17 'selector': '.popular-repos a.mini-repo-list-item',
-1 18 'fields': {
-1 19 'url': '@href',
-1 20 'name': '.repo',
-1 21 'description': '.repo-description',
-1 22 }
-1 23 },
13 24 },
14 25 'fields_doc': {
15 26 'login': 'github username',
16 27 'fullname': 'the user\'s full name',
17 28 'join-date': 'date when the user joined github in ISO-xx format'
18 29 }
19 -1 },
20 -1 'repos': {
21 -1 'host': 'https://github.com/',
22 -1 'type': 'scrape_list',
23 -1 'selector': '.popular-repos a.mini-repo-list-item@href'
24 30 }
25 31 }
26 32
diff --git a/jsonproxy/__init__.py b/jsonproxy/__init__.py
@@ -8,10 +8,22 @@ from flask import Flask
8 8
9 9 from .api import api
10 10
11 -1 TYPES = ['proxy', 'scrape_item', 'scrape_list']
-1 11 TYPES = ['proxy', 'scrape']
12 12 ENDPOINTS = 'ENDPOINTS'
13 13
14 14
-1 15 def check_fields_config(fields, endpoint, field=''):
-1 16 for key, value in fields.items():
-1 17 full_key = field + '.' + key if field else key
-1 18 if isinstance(value, dict):
-1 19 if 'selector' not in value:
-1 20 yield ('No selector configured for field %s in endpoint %s.' %
-1 21 (full_key, endpoint))
-1 22 if 'fields' in value:
-1 23 for error in check_fields_config(value['fields'], endpoint, full_key):
-1 24 yield error
-1 25
-1 26
15 27 def check_config(config):
16 28 errors = []
17 29
@@ -23,14 +35,12 @@ def check_config(config):
23 35 if _type not in TYPES:
24 36 errors.append('Unknown endpoint type %s for endpoint %s. '
25 37 'Choose one of %s.' % (_type, key, ', '.join(TYPES)))
26 -1 elif _type == 'scrape_item':
-1 38 elif _type == 'scrape':
27 39 if 'fields' not in data or len(data['fields']) == 0:
28 40 errors.append('No fields configured for endpoint %s of type %s.' %
29 41 (key, _type))
30 -1 elif _type == 'scrape_list':
31 -1 if 'selector' not in data:
32 -1 errors.append('Endpoint %s of type %s is missing a selector.' %
33 -1 (key, _type))
-1 42 else:
-1 43 errors += list(check_fields_config(data['fields'], key))
34 44
35 45 return errors
36 46
diff --git a/jsonproxy/api.py b/jsonproxy/api.py
@@ -37,11 +37,17 @@ def urlopen(url, parse=False):
37 37
38 38
39 39 def get_attribute_list(html, selector):
-1 40 s = selector.rsplit('@', 1)[0]
-1 41 if s:
-1 42 elements = html.select(s)
-1 43 else:
-1 44 elements = [html]
-1 45
40 46 if '@' in selector:
41 -1 s, attr = selector.rsplit('@', 1)
42 -1 return [element[attr] for element in html.select(s)]
-1 47 attr = selector.rsplit('@', 1)[1]
-1 48 return [element[attr] for element in elements]
43 49 else:
44 -1 return [element.text.strip() for element in html.select(selector)]
-1 50 return [element.text.strip() for element in elements]
45 51
46 52
47 53 def get_attribute(html, selector):
@@ -50,22 +56,24 @@ def get_attribute(html, selector):
50 56 return l[0]
51 57
52 58
53 -1 def scrape_item(url, config):
54 -1 tree = urlopen(url, parse=True)
55 -1 data = {
56 -1 'url': url
57 -1 }
58 -1 for key, selector in config['fields'].items():
59 -1 data[key] = get_attribute(tree, selector)
60 -1 return jsonify(data)
-1 59 def get_fields(html, config):
-1 60 data = {}
-1 61 for key, value in config['fields'].items():
-1 62 if isinstance(value, str):
-1 63 data[key] = get_attribute(html, value)
-1 64 elif 'fields' in value:
-1 65 elements = html.select(value['selector'])
-1 66 data[key] = [get_fields(e, value) for e in elements]
-1 67 else:
-1 68 data[key] = get_attribute_list(html, value['selector'])
-1 69 return data
61 70
62 71
63 -1 def scrape_list(url, config):
64 -1 tree = urlopen(url, parse=True)
65 -1 return jsonify({
66 -1 'url': url,
67 -1 'l': get_attribute_list(tree, config['selector'])
68 -1 })
-1 72 def scrape(url, config):
-1 73 html = urlopen(url, parse=True)
-1 74 data = get_fields(html, config)
-1 75 data['url'] = url
-1 76 return jsonify(data)
69 77
70 78
71 79 def proxy(url, config):
@@ -82,10 +90,8 @@ def main(endpoint, path):
82 90 url = request.url.replace(request.host_url + endpoint + '/', config['host'])
83 91 _type = config.get('type', 'proxy')
84 92
85 -1 if _type == 'scrape_item':
86 -1 response = scrape_item(url, config)
87 -1 elif _type == 'scrape_list':
88 -1 response = scrape_list(url, config)
-1 93 if _type == 'scrape':
-1 94 response = scrape(url, config)
89 95 else:
90 96 response = proxy(url, config)
91 97
@@ -95,29 +101,25 @@ def main(endpoint, path):
95 101 return response
96 102
97 103
-1 104 def _fields_doc(config):
-1 105 if isinstance(config, dict):
-1 106 fields = config.get('fields', {})
-1 107 doc = config.get('fields_doc', {})
-1 108 for key in fields:
-1 109 yield key, doc.get(key, ''), list(_fields_doc(fields[key]))
-1 110
-1 111
98 112 def _doc(endpoint):
99 113 config = current_app.config['ENDPOINTS'][endpoint]
100 -1 url_doc = 'url of the scraped page'
101 114
102 115 data = {
103 116 'title': endpoint,
104 117 'doc': config.get('doc', ''),
105 118 'type': config.get('type', 'proxy'),
106 -1 'fields': [],
-1 119 'fields': list(_fields_doc(config)),
107 120 }
108 121
109 -1 if data['type'] == 'scrape_item':
110 -1 fields_doc = config.get('fields_doc', {})
111 -1 data['fields'].append(('url', url_doc))
112 -1 for key in config['fields']:
113 -1 doc = fields_doc.get(key, '')
114 -1 data['fields'].append((key, doc))
115 -1
116 -1 if data['type'] == 'scrape_list':
117 -1 data['fields'] = [
118 -1 ('url', url_doc),
119 -1 ('l', 'list of results'),
120 -1 ]
-1 122 data['fields'].append(('url', 'url of the scraped page', []))
121 123
122 124 return data
123 125
diff --git a/jsonproxy/templates/index.html b/jsonproxy/templates/index.html
@@ -10,22 +10,20 @@
10 10 <h2>{{ endpoint.title }} ({{ endpoint.type }})</h2>
11 11 <p>{{ endpoint.doc }}</p>
12 12 {% if endpoint.fields %}
13 -1 <table>
14 -1 <thead>
15 -1 <tr>
16 -1 <th>name</th>
17 -1 <th>description</th>
18 -1 </tr>
19 -1 </thead>
20 -1 <tbody>
21 -1 {% for name, doc in endpoint.fields %}
22 -1 <tr>
23 -1 <td>{{ name }}</td>
24 -1 <td>{{ doc }}</td>
25 -1 </tr>
26 -1 {% endfor %}
27 -1 </tbody>
28 -1 </table>
-1 13 <dl>
-1 14 {% for name, doc, fields in endpoint.fields recursive %}
-1 15 <dt>{{ name }}</dt>
-1 16 <dd>
-1 17 {{ doc }}
-1 18
-1 19 {% if fields %}
-1 20 <dl>
-1 21 {{ loop(fields) }}
-1 22 </dl>
-1 23 {% endif %}
-1 24 </dd>
-1 25 {% endfor %}
-1 26 </dl>
29 27 {% endif %}
30 28 </section>
31 29 {% endfor %}