db-pkpass: Convert Deutsche Bahn PDF tickets to PKPass

commit: 42cc0e306d1e30edeb45860cd064e268df204665
parent: 0438ecc7c751dd7a28cf4d0167ff5af2e59e83d2
Author: Tobias Bengfort <tobias.bengfort@posteo.de>
Date: 2025-05-30 18:28

refactor extract()

Diffstat

M	db_pkpass.py	182	++++++++++++++++++++++++++++++++-----------------------------
M	tests.py	4	++--

2 files changed, 98 insertions, 88 deletions

diff --git a/db_pkpass.py b/db_pkpass.py

@@ -79,14 +79,6 @@ def extract_barcodes(pdf):
   79    79     return barcodes
   80    80 
   81    81 
   82    -1 def parse_leg_dt(datestr, timestr, prefix, start):
   83    -1     f = f'%d.%m.%Y {prefix} %H:%M'
   84    -1     dt = strptime(f'{datestr}{start.year} {timestr}', f)
   85    -1     if dt < start:
   86    -1         dt = strptime(f'{datestr}{start.year + 1} {timestr}', f)
   87    -1     return dt
   88    -1 
   89    -1 
   90    82 def iter_lines(pdf):
   91    83     last_x = 0
   92    84     last_y = 0
@@ -106,85 +98,106 @@ def iter_lines(pdf):
  106    98         yield line
  107    99 
  108   100 
  109    -1 def extract_legs(pdf):
  110    -1     legs = []
  111    -1     started = False
  112    -1     validity = extract_validity(pdf)
  113    -1     for line in iter_lines(pdf):
   -1   101 def parse_leg_dt(datestr, timestr, prefix, start):
   -1   102     f = f'%d.%m.%Y {prefix} %H:%M'
   -1   103     dt = strptime(f'{datestr}{start.year} {timestr}', f)
   -1   104     if dt < start:
   -1   105         dt = strptime(f'{datestr}{start.year + 1} {timestr}', f)
   -1   106     return dt
   -1   107 
   -1   108 
   -1   109 def parse_validity(text):
   -1   110     if 'bis' in text:
   -1   111         s_start, s_end = text.split(' bis ')
   -1   112         try:
   -1   113             start = strptime(s_start, '%d.%m.%Y %H:%M Uhr')
   -1   114             end = strptime(s_end, '%d.%m.%Y %H:%M Uhr')
   -1   115         except ValueError:
   -1   116             start = strptime(s_start, '%d.%m.%Y')
   -1   117             end = strptime(s_end, '%d.%m.%Y')
   -1   118     else:
   -1   119         s_start = text.removeprefix('Fahrtantritt am ')
   -1   120         start = strptime(s_start, '%d.%m.%Y')
   -1   121         end = start + datetime.timedelta(days=1)
   -1   122     return start, end
   -1   123 
   -1   124 
   -1   125 def extract_header(lines):
   -1   126     for i, line in enumerate(lines):
  114   127         text = ' '.join(line)
  115    -1         if text.startswith('Halt\nDatum\nZeit\nGleis'):
  116    -1             started = True
  117    -1         elif not started or text.startswith('Ihre Reiseverbindung '):
  118    -1             pass
  119    -1         elif text.startswith('Wichtige Nutzungshinweise') or not text.strip():
   -1   128         if i == 1:
   -1   129             title = text
   -1   130         elif '\nAuftragsnummer: ' in text:
   -1   131             id_label = 'Auftragsnummer'
   -1   132             id_value = text.split('\nAuftragsnummer: ', 1)[1]
   -1   133         elif '\nBahnCard-Nr.: ' in text:
   -1   134             id_label = 'BahnCard-Nr.'
   -1   135             id_value = text.split('\nBahnCard-Nr.: ', 1)[1]
   -1   136         elif text.startswith('Gültigkeit: '):
   -1   137             validity = parse_validity(text.removeprefix('Gültigkeit: '))
   -1   138         elif text.startswith('Fahrtantritt am '):
   -1   139             validity = parse_validity(text.removeprefix('Fahrtantritt am '))
   -1   140         elif text.startswith('Halt\nDatum\nZeit\nGleis'):
  120   141             break
  121    -1         else:
  122    -1             station1, station2 = (v.strip() for v in line[0].split('\n'))
  123    -1             date1, date2 = (v.strip() for v in line[1].split('\n'))
  124    -1             time1, time2 = (v.strip() for v in line[2].split('\n'))
  125    -1             legs.append({
  126    -1                 'start': {
  127    -1                     'station': station1,
  128    -1                     'datetime': parse_leg_dt(date1, time1, 'ab', validity[0])
  129    -1                 },
  130    -1                 'destination': {
  131    -1                     'station': station2,
  132    -1                     'datetime': parse_leg_dt(date2, time2, 'an', validity[0])
  133    -1                 },
  134    -1             })
   -1   142     return {
   -1   143         'title': title,
   -1   144         'id_label': id_label,
   -1   145         'id_value': id_value,
   -1   146         'valid_from': validity[0],
   -1   147         'valid_until': validity[1],
   -1   148     }
  135   149 
  136    -1             if len(line) > 3:
  137    -1                 platform1, platform2 = (v.strip() for v in line[3].split('\n'))
  138    -1                 if platform1:
  139    -1                     legs[-1]['start']['platform'] = platform1
  140    -1                 if platform2:
  141    -1                     legs[-1]['destination']['platform'] = platform2
  142   150 
  143    -1             if len(line) > 4:
  144    -1                 legs[-1]['train'] = line[4].strip().replace('\n', ' ')
  145    -1             else:
  146    -1                 legs[-1]['train'] = legs[-1]['destination'].pop('platform')
   -1   151 def extract_leg(line, start):
   -1   152     station1, station2 = (v.strip() for v in line[0].split('\n'))
   -1   153     date1, date2 = (v.strip() for v in line[1].split('\n'))
   -1   154     time1, time2 = (v.strip() for v in line[2].split('\n'))
   -1   155     leg = {
   -1   156         'start': {
   -1   157             'station': station1,
   -1   158             'datetime': parse_leg_dt(date1, time1, 'ab', start)
   -1   159         },
   -1   160         'destination': {
   -1   161             'station': station2,
   -1   162             'datetime': parse_leg_dt(date2, time2, 'an', start)
   -1   163         },
   -1   164     }
  147   165 
  148    -1             if len(line) > 5:
  149    -1                 legs[-1]['comment'] = line[5].strip().replace('\n', ' ')
   -1   166     if len(line) > 3:
   -1   167         platform1, platform2 = (v.strip() for v in line[3].split('\n'))
   -1   168         if platform1:
   -1   169             leg['start']['platform'] = platform1
   -1   170         if platform2:
   -1   171             leg['destination']['platform'] = platform2
  150   172 
  151    -1     return legs
   -1   173     if len(line) > 4:
   -1   174         leg['train'] = line[4].strip().replace('\n', ' ')
   -1   175     else:
   -1   176         leg['train'] = leg['destination'].pop('platform')
  152   177 
   -1   178     if len(line) > 5:
   -1   179         leg['comment'] = line[5].strip().replace('\n', ' ')
  153   180 
  154    -1 def extract_id(pdf):
  155    -1     for page in pdf:
  156    -1         for text in page.get_text().split('\n'):
  157    -1             for label in ['Auftragsnummer', 'BahnCard-Nr.']:
  158    -1                 key = f'{label}: '
  159    -1                 if text.startswith(key):
  160    -1                     return label, text[len(key):]
  161    -1     raise ValueError('No ID found')
   -1   181     return leg
  162   182 
  163   183 
  164    -1 def extract_title(pdf):
  165    -1     return pdf[0].get_text('blocks')[1][4].strip()
   -1   184 def extract(pdf):
   -1   185     lines = iter_lines(pdf)
   -1   186     header = extract_header(lines)
  166   187 
   -1   188     legs = []
   -1   189     for line in lines:
   -1   190         text = ' '.join(line)
   -1   191         if text.startswith('Wichtige Nutzungshinweise') or not text.strip():
   -1   192             break
   -1   193         elif text.startswith('Ihre Reiseverbindung '):
   -1   194             pass
   -1   195         elif text.startswith('Halt\nDatum\nZeit\nGleis'):
   -1   196             pass
   -1   197         else:
   -1   198             legs.append(extract_leg(line, header['valid_from']))
  167   199 
  168    -1 def extract_validity(pdf):
  169    -1     key1 = 'Gültigkeit: '
  170    -1     key2 = 'Fahrtantritt am '
  171    -1     for page in pdf:
  172    -1         for text in page.get_text().split('\n'):
  173    -1             if text.startswith(key1):
  174    -1                 s_start, s_end = text[len(key1):].split(' bis ')
  175    -1                 try:
  176    -1                     start = strptime(s_start, '%d.%m.%Y %H:%M Uhr')
  177    -1                     end = strptime(s_end, '%d.%m.%Y %H:%M Uhr')
  178    -1                 except ValueError:
  179    -1                     start = strptime(s_start, '%d.%m.%Y')
  180    -1                     end = strptime(s_end, '%d.%m.%Y')
  181    -1                 return start, end
  182    -1             elif text.startswith(key2):
  183    -1                 s_start = text[len(key2):]
  184    -1                 start = strptime(s_start, '%d.%m.%Y')
  185    -1                 end = start + datetime.timedelta(days=1)
  186    -1                 return start, end
  187    -1     raise ValueError('No validity information found')
   -1   200     return header, legs
  188   201 
  189   202 
  190   203 def format_stop(stop, train=None):
@@ -206,22 +219,20 @@ def format_legs(legs):
  206   219 
  207   220 
  208   221 def extract_content(pdf):
  209    -1     title = extract_title(pdf)
  210    -1     id_label, id_value = extract_id(pdf)
  211    -1     validity = extract_validity(pdf)
   -1   222     header, legs = extract(pdf)
  212   223 
  213   224     data = {
  214   225         'formatVersion': 1,
  215   226         'organizationName': 'Deutsche Bahn AG',
  216   227         'passTypeIdentifier': 'ticket.ce9e.org',
  217   228         'teamIdentifier': 'XXXXXXXXXX',
  218    -1         'serialNumber': id_value,
  219    -1         'description': title,
  220    -1         'expirationDate': validity[1].isoformat(),
   -1   229         'serialNumber': header['id_value'],
   -1   230         'description': header['title'],
   -1   231         'expirationDate': header['valid_until'].isoformat(),
  221   232         'relevantDates': [
  222   233             {
  223    -1                 'startDate': validity[0].isoformat(),
  224    -1                 'endDate': validity[1].isoformat(),
   -1   234                 'startDate': header['valid_from'].isoformat(),
   -1   235                 'endDate': header['valid_until'].isoformat(),
  225   236             },
  226   237         ],
  227   238         'barcodes': [
@@ -237,14 +248,13 @@ def extract_content(pdf):
  237   248             'auxiliaryFields': [
  238   249                 {
  239   250                     'key': 'id',
  240    -1                     'label': id_label,
  241    -1                     'value': id_value,
   -1   251                     'label': header['id_label'],
   -1   252                     'value': header['id_value'],
  242   253                 },
  243   254             ],
  244   255         },
  245   256     }
  246   257 
  247    -1     legs = extract_legs(pdf)
  248   258     if legs:
  249   259         start = legs[0]['start']['station']
  250   260         destination = legs[-1]['destination']['station']

diff --git a/tests.py b/tests.py

@@ -15,8 +15,8 @@ class ExtractLegsTests(unittest.TestCase):
   15    15     def _test_extract_leg(self, path, expected):
   16    16         with open(path, 'rb') as fh:
   17    17             pdf = pymupdf.open(stream=fh.read())
   18    -1         actual = db_pkpass.extract_legs(pdf)
   19    -1         self.assertEqual(actual, expected)
   -1    18         _header, legs = db_pkpass.extract(pdf)
   -1    19         self.assertEqual(legs, expected)
   20    20 
   21    21     def test_normalpreis(self):
   22    22         self._test_extract_leg('muster/Muster 918-9 Normalpreis.pdf', [