- commit
- 42cc0e306d1e30edeb45860cd064e268df204665
- parent
- 0438ecc7c751dd7a28cf4d0167ff5af2e59e83d2
- Author
- Tobias Bengfort <tobias.bengfort@posteo.de>
- Date
- 2025-05-30 18:28
refactor extract()
Diffstat
| M | db_pkpass.py | 182 | ++++++++++++++++++++++++++++++++----------------------------- |
| M | tests.py | 4 | ++-- |
2 files changed, 98 insertions, 88 deletions
diff --git a/db_pkpass.py b/db_pkpass.py
@@ -79,14 +79,6 @@ def extract_barcodes(pdf):
     return barcodes
 
 
-def parse_leg_dt(datestr, timestr, prefix, start):
-    f = f'%d.%m.%Y {prefix} %H:%M'
-    dt = strptime(f'{datestr}{start.year} {timestr}', f)
-    if dt < start:
-        dt = strptime(f'{datestr}{start.year + 1} {timestr}', f)
-    return dt
-
-
 def iter_lines(pdf):
     last_x = 0
     last_y = 0
@@ -106,85 +98,106 @@ def iter_lines(pdf):
     yield line
 
 
-def extract_legs(pdf):
-    legs = []
-    started = False
-    validity = extract_validity(pdf)
-    for line in iter_lines(pdf):
+def parse_leg_dt(datestr, timestr, prefix, start):
+    f = f'%d.%m.%Y {prefix} %H:%M'
+    dt = strptime(f'{datestr}{start.year} {timestr}', f)
+    if dt < start:
+        dt = strptime(f'{datestr}{start.year + 1} {timestr}', f)
+    return dt
+
+
+def parse_validity(text):
+    if 'bis' in text:
+        s_start, s_end = text.split(' bis ')
+        try:
+            start = strptime(s_start, '%d.%m.%Y %H:%M Uhr')
+            end = strptime(s_end, '%d.%m.%Y %H:%M Uhr')
+        except ValueError:
+            start = strptime(s_start, '%d.%m.%Y')
+            end = strptime(s_end, '%d.%m.%Y')
+    else:
+        s_start = text.removeprefix('Fahrtantritt am ')
+        start = strptime(s_start, '%d.%m.%Y')
+        end = start + datetime.timedelta(days=1)
+    return start, end
+
+
+def extract_header(lines):
+    for i, line in enumerate(lines):
         text = ' '.join(line)
-        if text.startswith('Halt\nDatum\nZeit\nGleis'):
-            started = True
-        elif not started or text.startswith('Ihre Reiseverbindung '):
-            pass
-        elif text.startswith('Wichtige Nutzungshinweise') or not text.strip():
+        if i == 1:
+            title = text
+        elif '\nAuftragsnummer: ' in text:
+            id_label = 'Auftragsnummer'
+            id_value = text.split('\nAuftragsnummer: ', 1)[1]
+        elif '\nBahnCard-Nr.: ' in text:
+            id_label = 'BahnCard-Nr.'
+            id_value = text.split('\nBahnCard-Nr.: ', 1)[1]
+        elif text.startswith('Gültigkeit: '):
+            validity = parse_validity(text.removeprefix('Gültigkeit: '))
+        elif text.startswith('Fahrtantritt am '):
+            validity = parse_validity(text.removeprefix('Fahrtantritt am '))
+        elif text.startswith('Halt\nDatum\nZeit\nGleis'):
             break
-        else:
-            station1, station2 = (v.strip() for v in line[0].split('\n'))
-            date1, date2 = (v.strip() for v in line[1].split('\n'))
-            time1, time2 = (v.strip() for v in line[2].split('\n'))
-            legs.append({
-                'start': {
-                    'station': station1,
-                    'datetime': parse_leg_dt(date1, time1, 'ab', validity[0])
-                },
-                'destination': {
-                    'station': station2,
-                    'datetime': parse_leg_dt(date2, time2, 'an', validity[0])
-                },
-            })
+    return {
+        'title': title,
+        'id_label': id_label,
+        'id_value': id_value,
+        'valid_from': validity[0],
+        'valid_until': validity[1],
+    }
 
-            if len(line) > 3:
-                platform1, platform2 = (v.strip() for v in line[3].split('\n'))
-                if platform1:
-                    legs[-1]['start']['platform'] = platform1
-                if platform2:
-                    legs[-1]['destination']['platform'] = platform2
 
-            if len(line) > 4:
-                legs[-1]['train'] = line[4].strip().replace('\n', ' ')
-            else:
-                legs[-1]['train'] = legs[-1]['destination'].pop('platform')
+def extract_leg(line, start):
+    station1, station2 = (v.strip() for v in line[0].split('\n'))
+    date1, date2 = (v.strip() for v in line[1].split('\n'))
+    time1, time2 = (v.strip() for v in line[2].split('\n'))
+    leg = {
+        'start': {
+            'station': station1,
+            'datetime': parse_leg_dt(date1, time1, 'ab', start)
+        },
+        'destination': {
+            'station': station2,
+            'datetime': parse_leg_dt(date2, time2, 'an', start)
+        },
+    }
 
-            if len(line) > 5:
-                legs[-1]['comment'] = line[5].strip().replace('\n', ' ')
+    if len(line) > 3:
+        platform1, platform2 = (v.strip() for v in line[3].split('\n'))
+        if platform1:
+            leg['start']['platform'] = platform1
+        if platform2:
+            leg['destination']['platform'] = platform2
 
-    return legs
+    if len(line) > 4:
+        leg['train'] = line[4].strip().replace('\n', ' ')
+    else:
+        leg['train'] = leg['destination'].pop('platform')
 
+    if len(line) > 5:
+        leg['comment'] = line[5].strip().replace('\n', ' ')
 
-def extract_id(pdf):
-    for page in pdf:
-        for text in page.get_text().split('\n'):
-            for label in ['Auftragsnummer', 'BahnCard-Nr.']:
-                key = f'{label}: '
-                if text.startswith(key):
-                    return label, text[len(key):]
-    raise ValueError('No ID found')
+    return leg
 
 
-def extract_title(pdf):
-    return pdf[0].get_text('blocks')[1][4].strip()
+def extract(pdf):
+    lines = iter_lines(pdf)
+    header = extract_header(lines)
 
+    legs = []
+    for line in lines:
+        text = ' '.join(line)
+        if text.startswith('Wichtige Nutzungshinweise') or not text.strip():
+            break
+        elif text.startswith('Ihre Reiseverbindung '):
+            pass
+        elif text.startswith('Halt\nDatum\nZeit\nGleis'):
+            pass
+        else:
+            legs.append(extract_leg(line, header['valid_from']))
 
-def extract_validity(pdf):
-    key1 = 'Gültigkeit: '
-    key2 = 'Fahrtantritt am '
-    for page in pdf:
-        for text in page.get_text().split('\n'):
-            if text.startswith(key1):
-                s_start, s_end = text[len(key1):].split(' bis ')
-                try:
-                    start = strptime(s_start, '%d.%m.%Y %H:%M Uhr')
-                    end = strptime(s_end, '%d.%m.%Y %H:%M Uhr')
-                except ValueError:
-                    start = strptime(s_start, '%d.%m.%Y')
-                    end = strptime(s_end, '%d.%m.%Y')
-                return start, end
-            elif text.startswith(key2):
-                s_start = text[len(key2):]
-                start = strptime(s_start, '%d.%m.%Y')
-                end = start + datetime.timedelta(days=1)
-                return start, end
-    raise ValueError('No validity information found')
+    return header, legs
 
 
 def format_stop(stop, train=None):
@@ -206,22 +219,20 @@ def format_legs(legs):
 
 
 def extract_content(pdf):
-    title = extract_title(pdf)
-    id_label, id_value = extract_id(pdf)
-    validity = extract_validity(pdf)
+    header, legs = extract(pdf)
 
     data = {
         'formatVersion': 1,
         'organizationName': 'Deutsche Bahn AG',
         'passTypeIdentifier': 'ticket.ce9e.org',
         'teamIdentifier': 'XXXXXXXXXX',
-        'serialNumber': id_value,
-        'description': title,
-        'expirationDate': validity[1].isoformat(),
+        'serialNumber': header['id_value'],
+        'description': header['title'],
+        'expirationDate': header['valid_until'].isoformat(),
         'relevantDates': [
             {
-                'startDate': validity[0].isoformat(),
-                'endDate': validity[1].isoformat(),
+                'startDate': header['valid_from'].isoformat(),
+                'endDate': header['valid_until'].isoformat(),
             },
         ],
         'barcodes': [
@@ -237,14 +248,13 @@ def extract_content(pdf):
             'auxiliaryFields': [
                 {
                     'key': 'id',
-                    'label': id_label,
-                    'value': id_value,
+                    'label': header['id_label'],
+                    'value': header['id_value'],
                 },
             ],
         },
     }
 
-    legs = extract_legs(pdf)
     if legs:
         start = legs[0]['start']['station']
         destination = legs[-1]['destination']['station']
diff --git a/tests.py b/tests.py
@@ -15,8 +15,8 @@ class ExtractLegsTests(unittest.TestCase):
     def _test_extract_leg(self, path, expected):
         with open(path, 'rb') as fh:
             pdf = pymupdf.open(stream=fh.read())
-        actual = db_pkpass.extract_legs(pdf)
-        self.assertEqual(actual, expected)
+        _header, legs = db_pkpass.extract(pdf)
+        self.assertEqual(legs, expected)
 
     def test_normalpreis(self):
         self._test_extract_leg('muster/Muster 918-9 Normalpreis.pdf', [