"""Generate table rows for named character entities from a saved wiki table.

Reads wiki-markup table rows from ``wikipedia_table.txt`` and writes, for
every row matched by ``regex``, one line of the form::

    "name", <byte count>, { 0xNN, 0xNN }

where the bytes are the UTF-8 encoding of the row's decimal code point.
"""
import io
import re
import sys

# Raw string so that the regex escapes (\|, \w, \d, ...) are handed to the
# regex engine untouched instead of being parsed as (invalid) string escapes.
# Groups: (1) entity name, (2) decimal code point, (3) text of the next cell.
regex = re.compile(r'\|-\n\| (\w+)\n\|.+\n\| U\+\w+ \((\d+)\)\n\| (.+)\n')

try:
    _code_point_to_text = unichr  # Python 2
except NameError:
    _code_point_to_text = chr  # Python 3: chr() covers all code points


def _utf8_bytes(code_point):
    """Return the UTF-8 encoding of *code_point* as a list of integers."""
    encoded = _code_point_to_text(code_point).encode('utf-8')
    # bytearray iterates as integers on both Python 2 and Python 3.
    return list(bytearray(encoded))


def _format_row(ent_name, code_point):
    """Return one output row (without newline) for an entity."""
    encoded = _utf8_bytes(code_point)
    hex_bytes = ", ".join("0x%02X" % byte for byte in encoded)
    return '"%s", %d, { %s }' % (ent_name, len(encoded), hex_bytes)


def main(path='wikipedia_table.txt', out=None):
    """Convert the wiki table at *path* into entity rows written to *out*.

    Args:
        path: File containing the wiki-markup table.
        out: Writable text stream; defaults to ``sys.stdout``.
    """
    if out is None:
        out = sys.stdout
    # NOTE(review): assumes the saved table is UTF-8 encoded -- confirm if
    # the file was exported with a different encoding.
    with io.open(path, encoding='utf-8') as wiki_table:
        table_text = wiki_table.read()
    # The third group is required for a row to match but is not emitted.
    for ent_name, dec_code, _unused_cell in regex.findall(table_text):
        out.write(_format_row(ent_name, int(dec_code)) + '\n')


if __name__ == '__main__':
    main()