#!/usr/bin/env python3
"""Convert html5lib tokenizer tests into flat test and result files.

Every ``*.test`` JSON file found in ``../html5lib-tests/tokenizer/`` (except
``xmlViolation``) is rewritten as two text files sharing its base name:

* ``test/html-tokenizer/<name>.test`` -- for every emitted case, a header
  line ``<counter> <last start tag> <state number> <UTF-8 byte length>``
  followed by the input text and a newline.
* ``result/html-tokenizer/<name>.test`` -- for every emitted case, a line
  holding the counter followed by the formatted expected tokens.
"""

import glob
import json
import re
from pathlib import Path

# Maps the tokenizer state names used by the JSON tests to the numeric ids
# written into the generated test headers.
state_map = {
    'Data state': 0,
    'RCDATA state': 1,
    'RAWTEXT state': 2,
    'PLAINTEXT state': 3,
    'Script data state': 4,
    'CDATA section state': 5,
}

# Cases whose initial state maps to this id are recognised but not emitted.
SKIPPED_STATE = state_map['CDATA section state']

# A literal ``\uXXXX`` escape sequence left in the text of a test.
ESCAPE_RE = re.compile(r'\\u([A-Fa-f0-9]{4})')
# A literal ``\uD8xx``..``\uDFxx`` escape, i.e. a surrogate code point.
SURROGATE_RE = re.compile(r'\\uD[89A-F]', re.I)


def unescape(text):
    """Return *text* with each literal ``\\uXXXX`` replaced by its character."""
    return ESCAPE_RE.sub(lambda m: chr(int(m[1], 16)), text)


def format_tokens(tokens):
    """Render the expected-token list of one test as newline-terminated lines.

    Each token contributes a line with its type (``token[0]``).  A ``DOCTYPE``
    token is followed by three lines holding ``token[1]``..``token[3]``, with
    ``None`` rendered as an empty line.  Any other token is followed by one
    line holding ``token[1]``; for a ``StartTag`` the attributes in
    ``token[2]`` are appended to that line as `` name=value`` pairs.

    After assembly, literal ``\\uXXXX`` escapes are decoded and every NUL
    character is replaced by U+FFFD.
    """
    lines = []
    for token in tokens:
        kind = token[0]
        lines.append(kind)
        if kind == 'DOCTYPE':
            for field in (token[1], token[2], token[3]):
                lines.append('' if field is None else field)
        else:
            data = token[1]
            if kind == 'StartTag':
                data += ''.join(
                    f' {name}={value}' for name, value in token[2].items()
                )
            lines.append(data)
    rendered = ''.join(line + '\n' for line in lines)
    # Decode escapes first so that an escaped NUL is replaced as well.
    return unescape(rendered).replace('\x00', '\uFFFD')


def iter_cases(root, source):
    """Yield ``(start_tag, state_no, text, expected)`` for each emitted case.

    *root* is the decoded JSON document of one test file; every value of the
    top-level mapping is treated as a list of tests.  *source* is only used
    to label the error message.

    A test is skipped entirely when its input contains an escaped surrogate.
    A test is emitted once per entry of ``initialStates`` (default: the data
    state), except for entries naming the CDATA section state.

    Raises:
        ValueError: if a test names an initial state missing from
            ``state_map``.
    """
    for tests in root.values():
        for test in tests:
            raw = test['input']
            # Skip surrogate tests
            if SURROGATE_RE.search(raw):
                continue
            text = unescape(raw)
            expected = format_tokens(test['output'])
            start_tag = test.get('lastStartTag', '-')
            for state in test.get('initialStates', ['Data state']):
                state_no = state_map.get(state)
                if state_no is None:
                    raise ValueError(f'{source}: unknown state: {state}')
                if state_no == SKIPPED_STATE:
                    continue
                yield start_tag, state_no, text, expected


def convert_file(filename, test_path, result_path):
    """Convert the JSON test file *filename* into *test_path*/*result_path*.

    The outputs are written as UTF-8 without newline translation so that the
    byte length recorded in each header (computed from the UTF-8 encoding of
    the input) matches the bytes actually stored, whatever the platform's
    default encoding or line separator is.  Both files are closed even when
    conversion fails part-way through.
    """
    with open(filename, encoding='utf-8') as json_data:
        root = json.load(json_data)

    with open(test_path, 'w', encoding='utf-8', newline='\n') as test_out, \
            open(result_path, 'w', encoding='utf-8', newline='\n') as result_out:
        cases = iter_cases(root, filename)
        for counter, (start_tag, state_no, text, expected) in enumerate(cases):
            test_out.write(f'{counter} {start_tag} {state_no} '
                           f'{len(text.encode())}\n')
            test_out.write(text)
            test_out.write('\n')
            result_out.write(f'{counter}\n')
            result_out.write(expected)


def main():
    """Convert every tokenizer test file except ``xmlViolation``."""
    for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
        # The base name without ".test"; independent of the path separator.
        testname = Path(filename).stem
        if testname == 'xmlViolation':
            continue
        convert_file(
            filename,
            f'test/html-tokenizer/{testname}.test',
            f'result/html-tokenizer/{testname}.test',
        )


if __name__ == '__main__':
    main()