#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Script for decoding Lingea Dictionary (.trd) file
# Result is a tab-separated (\t) file, easily convertible
# by stardict-editor from package stardict-tools into native
# Stardict dictionary (stardict.sf.net and www.stardict.org)
#
# Copyright (C) 2007 - Klokan Petr Přidal (www.klokan.cz)
#
# Based on script CobuildConv.rb by Nomad
# http://hp.vector.co.jp/authors/VA005784/cobuild/cobuildconv.html
#
# Version history:
# 0.7 (19.6.2008) Patch by Petr Dlouhy, added support for many new dicts
# 0.6 (29.5.2008) Patch by Petr Dlouhy, added support for French-Czech and
#                 Spanish-Czech dictionaries; automatic encoding selection;
#                 all unrecognized characters are printed now as #something#
#                 (note: some of them are bugs in Lingea dictionaries); typo
# 0.5 (3.12.2007) Patch by Petr Dlouhy, iPaq and 2000 dicts support
#                 Patch by Josef Riha
# 0.4 (30.10.2007) Patch by Petr Dlouhy, optional HTML generation
# 0.3 (28.10.2007) Patch by Petr Dlouhy, cleanup, bugfix. More dictionaries.
# 0.2 (19.7.2007) Changes, documentation, first 100% dictionary
# 0.1 (20.5.2006) Initial version based on Nomad specs
#
# Supported dictionaries:
# - Lingea Německý Kapesní slovník
# - Lingea Anglický Kapesní slovník
# - Lingea 2002 series (theoretically all of them)
# - Lingea 2000 series (theoretically all of them)
# - Lingea Pocket series
#
# Tested on these dicts:
# lg_czen-eco.trd
# lg_czen-ind.trd
# lg_czen-law.trd
# lg_czen-2.trd
# lg_czgr-eco.trd
# lg_czgr-1.trd
# lg_encz-abc.trd
# lg_encz-eco.trd
# lg_encz-grm.trd
# lg_encz-ind.trd
# lg_encz-laa.trd
# lg_encz-law.trd
# lg_encz-2.trd
# lg_en-rth.trd
# lg_ensk-2.trd
# lg_ensk-2_.trd
# lg_en-wn.trd
# lg_frcz-1f.trd
# lg_grcz-eco.trd
# lg_grcz-1.trd
# lg_grsk-2.trd
# lg_sken-2.trd
# lg_sken-2_.trd
# lg_skgr-2.trd
# lg_spcz-1.trd
# 00czus-1.trd
# 00uscz-1.trd
#
#
# Modified by:
# - Petr Dlouhy (petr.dlouhy | email.cz)
#   Generalization of data block rules, sampleFlag 0x04, sound out fix,
#   data phrase prefix with comment (0x04)
#   HTML output, debugging patch, options on command line
#   Decoding for 2000 and Pocket series.
#
# - Ing. Josef Riha ( jose1711 | gmail.com )
#   Slovak letters support
#
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

import getopt
import locale
import re
import sys
from struct import unpack

# VERSION
VERSION = "0.7"

# Run-time options. These are the defaults; main() overwrites them from the
# command line before any record is decoded.
DEBUG = False          # print a trace of every byte/string read
OUTSTYLE = 2           # 0 = no tags, 1 = \n tags, 2 = html tags
DEBUGHEADER = False    # if DEBUG and DEBUGHEADER, print every decoded record
DEBUGALL = False       # if DEBUG and DEBUGALL, trace all records
DEBUGLIMIT = 1         # number of wrong records to trace before stopping
FILENAME = None        # path of the .trd file, set by main()

# State of the dictionary file being decoded; filled in by main().
f = None               # open binary file object
index = []             # absolute file offset of every record
entryCount = 0         # number of records, from the file header
smallIndex = False     # True for "small" dictionaries (header word 3 == 2052)

# Cursor state shared by decode(), outInt() and outStr().
bs = ()                # bytes of the record currently being decoded
pos = 0                # read position inside bs
lastWordClass = 0      # last word class emitted inside the current record


def usage():
    """Print the command-line help text to standard output."""
    lines = [
        "Lingea Dictionary Decoder",
        "-------------------------",
        "Version: %s" % VERSION,
        "Copyright (C) 2007 - Klokan Petr Pridal, Petr Dlouhy",
        "",
        "Usage: python lingea-trd-decoder.py DICTIONARY.trd > DICTIONARY.tab",
        "Result conversion by stardict-tools: /usr/lib/stardict-tools/tabfile",
        "",
        " -o --out-style : Output style",
        " 0 no tags",
        " 1 \\n tags",
        " 2 html tags",
        " -h --help : Print this message",
        " -d --debug : Debug",
        " -r --debug-header : Debug - print headers",
        " -a --debug-all : Debug - print all records",
        " -l --debug-limit : Debug limit",
        "",
        "For HTML support in StarDict dictionary .ifo has to contain:",
        "sametypesequence=g",
        "!!! Change the .ifo file after generation by tabfile !!!",
        "",
    ]
    # One print per original line: join with newlines, print adds the last.
    print("\n".join(lines))


################################################################
# OUTPUT TAGS
################################################################

# For every output style: element key -> (prefix, suffix) wrapped around the
# decoded text of that element. '\\n' is a literal backslash-n in the output.
TAG_STYLES = {
    0: {
        'db': ('', ''),               # Data beginning
        'rn': ('', '\t'),             # Record name
        'va': ('', ' '),              # Header variant
        'wc': ('(', ')'),             # WordClass
        'pa': ('', ' '),              # Header parts
        'fo': ('(', ') '),            # Header forms
        'on': ('(', ')'),             # Header origin note
        'pr': ('[', ']'),             # Header pronunciation; not printed by Lingea
        'du': ('(', ')'),             # Data sub example
        'dc': ('(', ')'),             # Data sub shortcut
        'hs': ('(', ') '),            # Header source
        'dv': ('{', '} '),            # Header dataVariant
        'sh': ('', ''),               # Header shortcut
        'pv': ('/', '/ '),            # Header plural variant
        'ex': ('(', ') '),            # Header example
        'sa': ('`', '`'),             # Data sample
        'sw': ('', ''),               # Data sample wordclass; is not printed by Lingea (it is printed only in French?)
        'do': ('`', '`'),             # Data origin note
        'df': ('', ' '),              # Data definition
        'nt': ('', ' '),              # Data note
        'ps': ('"', '" '),            # Data phrase short form
        'pg': ('"', ' = '),           # Data phrase green
        'pc': ('`', '`'),             # Data phrase comment; this comment is not printed by Lingea, but it seems useful
        'p1': ('"', ' = '),           # Data phrase 1
        'p2': ('', '" '),             # Data phrase 2
        'rs': ('SYNONYM: ', ''),      # Reference synonym
        'rr': ('HYPERNYM: ', ''),     # Reference hypernym
        'rp': ('HYPONYM: ', ''),      # Reference hyponym
        'sp': ('"', ' = '),           # Data simple phrase
        'b1': ('"', ' = '),           # Data phrase (block) 1
        'b2': ('" ', ''),             # Data phrase (block) 2
    },
    1: {
        'db': ('•', ''),              # Data beginning
        'rn': ('', '\t'),             # Record name
        'va': ('', ' '),              # Header variant
        'wc': ('', '\\n'),            # WordClass
        'pa': ('', ':\\n'),           # Header parts
        'fo': ('(', ') '),            # Header forms
        'on': ('(', ')\\n'),          # Header origin note
        'pr': ('[', ']\\n'),          # Header pronunciation; not printed by Lingea
        'du': ('(', ')'),             # Data sub example
        'dc': ('(', ')'),             # Data sub shortcut
        'hs': ('(', ')\\n'),          # Header source
        'dv': ('{', '} '),            # Header dataVariant
        'sh': ('', '\\n'),            # Header shortcut
        'pv': ('/', '/\\n'),          # Header plural variant
        'ex': ('(', ')\\n'),          # Header example
        'sa': (' ', '\\n'),           # Data sample
        'sw': ('', ''),               # Data sample wordclass; is not printed by Lingea (it is printed only in French?)
        'do': (' ', ' '),             # Data origin note
        'df': (' ', '\\n'),           # Data definition
        'nt': (' ', '\\n'),           # Data note
        'ps': (' ', '\\n'),           # Data phrase short form
        'pg': (' ', ' '),             # Data phrase green
        'pc': (' ', ' '),             # Data phrase comment; this comment is not printed by Lingea, but it seems useful
        'p1': (' ', ' '),             # Data phrase 1
        'p2': (' ', '\\n'),           # Data phrase 2
        'rs': ('SYNONYM: ', '\\n'),   # Reference synonym
        'rr': ('HYPERNYM: ', '\\n'),  # Reference hypernym
        'rp': ('HYPONYM: ', '\\n'),   # Reference hyponym
        'sp': (' ', '\\n'),           # Data simple phrase
        'b1': ('"', ' = '),           # Data phrase (block) 1
        'b2': ('" ', ''),             # Data phrase (block) 2
    },
    # NOTE(review): style 2 is advertised as "html tags" but no markup is
    # present in these values - it may have been lost before this revision.
    2: {
        'db': ('•', ''),              # Data beginning
        'rn': ('', '\t'),             # Record name
        'va': ('', ' '),              # Header variant
        'wc': ('', '\\n'),            # WordClass
        'pa': ('', ':\\n'),           # Header parts
        'fo': ('(', ') '),            # Header forms
        'on': ('(', ')\\n'),          # Header origin note
        'pr': ('[', ']\\n'),          # Header pronunciation; not printed by Lingea
        'du': ('(', ')'),             # Data sub example
        'dc': ('(', ')'),             # Data sub shortcut
        'hs': ('(', ')\\n'),          # Header source
        'dv': ('{', '} '),            # Header dataVariant
        'sh': ('', '\\n'),            # Header shortcut
        'pv': ('/', '/\\n'),          # Header plural variant
        'ex': ('(', ')\\n'),          # Header example
        'sa': (' ', '\\n'),           # Data sample
        'sw': ('', ''),               # Data sample wordclass; is not printed by Lingea (it is printed only in French?)
        'do': (' ', ' '),             # Data origin note
        'df': (' ', '\\n'),           # Data definition
        'nt': ('', ''),               # Data note
        'ps': (' ', '\\n'),           # Data phrase short form
        'pg': (' ', ' '),             # Data phrase green
        'pc': (' ', ' '),             # Data phrase comment; this comment is not printed by Lingea, but it seems useful
        'p1': (' ', ' '),             # Data phrase 1
        'p2': (' ', '\\n'),           # Data phrase 2
        'rs': ('SYNONYM: ', '\\n'),   # Reference synonym
        'rr': ('HYPERNYM: ', '\\n'),  # Reference hypernym
        'rp': ('HYPONYM: ', '\\n'),   # Reference hyponym
        'sp': (' ', '\\n'),           # Data simple phrase
        'b1': ('"', ' = '),           # Data phrase (block) 1
        'b2': ('" ', ''),             # Data phrase (block) 2
    },
}

# Tag table of the active output style; main() re-selects it after parsing -o.
tag = TAG_STYLES[OUTSTYLE]


################################################################
# TRANSLATION TABLES
################################################################

def _placeholders(prefix, start, stop):
    """Return ['#<prefix><n>#' for n in start..stop-1] (unknown-code markers)."""
    return ['#%s%d#' % (prefix, n) for n in range(start, stop)]


# 6-bit alphabet of "small" dictionaries.
# TODO: smallIndex might not correspond with encoding
_ALPHA_SMALL = [
    '\x00', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
    'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'x', 'y', 'z', 'á', 'ä', 'č',
    'ď', 'é', 'ě', 'í', '#AL34#', '#AL35#', 'ň', 'ó', 'ö', '#AL39#',
    'ř', 'š', 'ť', 'ú', 'ů', 'ü', 'ý', 'ž', 'ß', ' ',
    '.', ',', '-', '\'', '(', ')', '`', '"', '#AL58#', '#AL59#',
    '#UPCASE#', 'à', '#SPECIAL#', "#AL1234213"]  # 4 bytes after unicode

# Upper-case table of "small" dictionaries (63 entries in the source).
_UPCASE_SMALL = [
    '\x00', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
    'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
    'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Á', 'Ä', 'Č',
    'Ď', 'É', 'Ě', 'Í', '<', '>', 'Ň', 'Ó', '-', '#UP39#',
    'Ř', 'Š', 'Ť', 'Ú', 'Ů', 'Ü', 'Ý', 'Ž', '#UP48#', ' ',
    '#UP.#', '#UP,#', '#UP-#', '#UP\'#', '#UP(#', '#UP)#', '#UP`#', '#UP"#', '#UP58#', '#UP59#',
    '#~UPCASE#', 'À', '#UP/#']  # 4 bytes after unicode

# 6-bit alphabet of all other dictionaries; '#...#' entries are prefixes that
# combine with the following code(s), see decode_alpha_postprocessing().
_ALPHA_BIG = [
    '\x00', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
    'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'x', 'y', 'z', '#AL27#', '#AL28#', '#AL29#',
    '#AL30#', '#AL31#', ' ', '.', '<', '>', ',', ';', '-', '#AL39#',
    '#GRAVE#', '#ACUTE#', '#CIRC#', '#TILDE#', '#UML#', '#AL45#', '#DACUT#', '#CARON#', '#BREVE#', '#CEDIL#',
    '#STROKE#', '#SHARP#', 'β', '#AL53#', '#AL54#', '#AL55#', '#AL56#', '#AL57#',
    's',  # symbol 58 is used in Spanish word pillo as s (seimpre)
    '#SYMBOL#',
    '#PRON#', '#UPCASE#', '#SPECIAL#', '#UNICODE#']  # 4 bytes after unicode

# Upper-case table of all other dictionaries: letters live at codes 32..57.
_UPCASE_BIG = (_placeholders('UP', 0, 32)
               + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
               + _placeholders('UP', 58, 64))

# Active alphabet; main() switches to the small tables when needed.
alpha = _ALPHA_BIG
upcase = _UPCASE_BIG

# IPA replacement for upcase[i] inside pronunciations.
upcase_pron = [
    '#upr0#', '#upr1#', '#upr2#', '#upr3#', '#upr4#', '#upr5#', '#upr6#', '#upr7#', '#upr8#', '#upr9#',
    '#upr10#', '#upr11#', '#upr12#', '#upr13#', '#upr14#', '#upr15#', '#upr16#', '#upr17#', '#upr18#', '#upr19#',
    '#upr20#', '#upr21#', '#upr22#', '#upr23#', '#upr24#', '#upr25#', '#upr26#', '#upr27#', '#upr28#', '#upr29#',
    '#upr30#', '#upr31#', 'ɑ', '#upr33#', 'ʧ', 'ð', 'ə', 'ɜ', '#upr38#', 'æ',
    'ɪ', 'ɭ', '#upr42#', 'ŋ', '#upr44#', 'ɳ', 'ɔ', '#upr47#', 'ɒ', 'ɽ',
    'ʃ', 'θ', 'ʊ', 'ʌ', '#pr54#', '#upr55#', '#upr56#', 'ʒ', '#upr58#', '#upr59#',
    '#upr60#', '#upr61#', '#upr62#', '#upr63#']

# Character selected by the code following a '#SYMBOL#' prefix.
symbol = [
    '#SY0#', '#SY1#', '„', '…', '§', '#SY5#', '#SY6#', '#SY7#', '‘', '’',
    '“', '”', '#SY12#', '—', '#SY14#', '™', '#SY16#', '¡', '¢', '£',
    '¤', '#SY21#', '#SY22#', '§', '©', '#SY25#', '#SY26#', '#SY27#', '®', '°',
    '#SY30#', '²', '³', '#SY33#', '#SY34#', '#SY35#', '¹', '#SY37#', '#SY38#', '#SY39#',
    '½', '#SY41#', '¿', '×', '÷'] + _placeholders('SY', 45, 64)

# Character selected by the code following a '#SPECIAL#' prefix.
special = [
    '#SP0#', '!', '"', '#', '$', '%', '&', '\'', '(', ')',
    '*', '+', '#SP12#', '#SP13#', '#SP14#', '/', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', ':', ';', '<', '=',
    '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{',
    '|', '}', '~'] + _placeholders('SP', 43, 64)

# Word-class labels indexed by the word-class byte (valid range 0..31).
wordclass = (
    'subs:', 'n:', 'adj:', 'pron:', 'num:', 'v:', 'adv:', 'prep:', 'conj:', 'part:',
    'intr:', 'phr:', '#WC12#', '#WC13#', '#WC14#', '#WC15#', '#WC16#', '#WC17#', '#WC18#', '#WC19#',
    'm/f:', 'm:', 'f:', '#WC23#', '#WC24#', '#WC25#', '#WC26#', '#WC27#', '#WC28#', '#WC29#',
    '#WC30#', '#WC31#')

# prefix -> {following decoded character(s) -> combined character}.
# The three list-valued entries are indexed by the following raw code instead.
subs = {
    "#GRAVE#": {
        'a': 'à', 'e': 'è',
        'u': 'û',  # NOTE(review): 'û' is a circumflex (grave would be 'ù') - confirm
        # '#SPECIAL#': '?' # what the hell is this one
        # 'q': '?', # what the hell is this one
        # 's': '?', # what the hell is this one
    },
    "#UML#": {
        'o': 'ö', 'u': 'ü', 'a': 'ä', 'e': 'ë', 'i': 'ï',
        ' ': 'Ä', '#DACUT#': 'Ö', 'β': 'Ü',
    },
    "#ACUTE#": {
        'a': 'á', 'e': 'é', 'i': 'í', 'n': 'ń', 'o': 'ó', 'u': 'ú',
        'l': 'ĺ', 'r': 'ŕ', 'y': 'ý',
        ' ': 'Á', ',': 'É', '#DACUT#': 'Ó', '#AL56#': 'Ý', '#GRAVE#': 'Í',
        '#CEDIL#': 'Ŕ', 'β': 'Ú', '<': 'Ć',
    },
    "#CARON#": {
        'r': 'ř', 'c': 'č', 's': 'š', 'z': 'ž', 'e': 'ě', 'd': 'ď',
        't': 'ť', 'a': 'å', 'u': 'ů', 'n': 'ň', 'l': 'ľ',
        '<': 'Č', '>': 'Ď', '#STROKE#': 'Š', ' ': 'Å', ',': 'Ě', 'β': 'Ů',
        '#TILDE#': 'Ľ', '#CEDIL#': 'Ř', '#SHARP#': 'Ť', '#AL45#': 'Ň',
        '#AL57#': 'Ž',
    },
    "#SHARP#": {
        's': 'ß', 'o': 'œ', 'a': 'æ', '#DACUT#': 'Œ',
    },
    "#TILDE#": {
        'n': 'ñ', 'o': 'õ', 'a': 'ã', 'i': 'ĩ',
        # 'e': '?' # what the hell is this one
        # '#SYMBOL#': '?' # what the hell is this one
    },
    "#CIRC#": {
        'a': 'â', 'e': 'ê', 'o': 'ô', 'i': 'î', 'u': 'û',
        ' ': 'Â',
        ',': 'Ê',        # used in french word survętement, but not decoded by Lingea
        '#GRAVE#': 'Î',  # used in french île, but not decoded by Lingea
        '#DACUT#': 'Ô',
    },
    "#CEDIL#": {
        'c': 'ç', 'e': 'ę', 'a': 'ą', 'k': 'ķ', 'i': 'ļ', 'n': 'ņ',
        '<': 'Ç',
        # 'j': '?' # what the hell is this one
        # '#UML#': '?' # what the hell is this one (used in word Jesús)
    },
    "#DACUT#": {
        'u': 'ű', 'z': 'ż',
    },
    "#STROKE#": {
        'l': 'ł',
    },
    "#BREVE#": {
        'a': 'ă',
    },
    "#PRON#": {
        'el': 'ɛ', 'ou': 'ɶ', 'or': 'ɸ', '#CEDIL#c': 'ʀ', 'hi': 'ɥ',
        'nh': 'ɲ', 'ex': 'ɛ̃', 'cv': 'ɔ̃', 'ov': 'œ̃', 'av': 'ɑ̃',
    },
    "#UPCASE#": upcase,
    "#SYMBOL#": symbol,
    "#SPECIAL#": special,
}


# Print color debug functions (ANSI escape sequences)
def purple(c):
    return '\x1b[1;35m' + c + '\x1b[0m'


def blue(c):
    return '\x1b[1;34m' + c + '\x1b[0m'


def cyan(c):
    return '\x1b[36m' + c + '\x1b[0m'


def gray(c):
    return '\x1b[1m' + c + '\x1b[0m'


def getRec(n):
    """Get data stream for record of given number.

    Reads the bytes between index[n] and index[n+1] from the open file `f`.
    Returns '' when n is outside 0..entryCount-1.
    """
    if 0 <= n < entryCount:
        f.seek(index[n])
        return f.read(index[n + 1] - index[n])
    return ''


def decode_alpha(stream, nullstop=True):
    """Decode 6-bit encoding data stream from the beginning until first NULL.

    stream   -- sequence of byte values (ints); four 6-bit codes are packed
                into every three bytes.
    nullstop -- stop at the first 6-bit code equal to 0 (default).

    Returns (decoded_string, index_of_last_byte_consumed).
    """
    offset = 0   # number of 6-bit codes extracted so far
    triple = 0   # index of the next unread byte
    result = []
    while triple < len(stream):
        phase = offset % 4
        if phase == 0:
            c = stream[triple] >> 2
            triple += 1
        elif phase == 1:
            c = (stream[triple - 1] & 3) << 4 | stream[triple] >> 4
            triple += 1
        elif phase == 2:
            c = (stream[triple - 1] & 15) << 2 | (stream[triple] & 192) >> 6
            triple += 1
        else:
            # Fourth code sits entirely in the byte already read.
            c = stream[triple - 1] & 63
        if c == 0 and nullstop:
            break
        offset += 1
        # TODO: ENCODE UNICODE 4 BYTE STREAM!!! and put it after #UNICODE# as unichr()
        result.append(c)
    return decode_alpha_postprocessing(result), triple - 1


def decode_alpha_postprocessing(input):
    """Lowlevel alphabet decoding postprocessing, combines tuples into one character.

    input -- list of 6-bit codes. A code whose alphabet entry starts with '#'
    is a prefix: together with the following one (two for '#PRON#') code it
    is looked up in `subs`. Unknown combinations are emitted verbatim as
    '#...#' markers. The result ends with four NUL characters produced by
    the internal padding; callers cut the string at the first NUL.
    """
    # Work on a padded copy so look-ahead never runs off the end and the
    # caller's list is left untouched.
    codes = list(input) + [0x00] * 5
    result = ""
    # UPCASE, UPCASE_PRON, SYMBOL, SPECIAL
    skip = 0  # number of following codes already consumed by a prefix
    for i in range(len(codes) - 1):
        if skip > 0:
            skip -= 1
            continue
        c = alpha[codes[i]]
        bc1 = codes[i + 1]
        c1 = alpha[bc1]
        if c[0] != '#':
            result += c
            continue
        skip = 1
        if c not in subs:
            result += c  # debug
        elif c in ("#UPCASE#", "#SPECIAL#", "#SYMBOL#"):
            # These tables are indexed by the raw following code.
            result += subs[c][bc1]
        elif c == "#PRON#":
            # Pronunciation symbols are keyed by the next two characters.
            cc = c1 + alpha[codes[i + 2]]
            if cc in subs[c]:
                result += subs[c][cc]
            else:
                result += c + cc  # debug
            skip = 2
        elif c1 in subs[c]:
            result += subs[c][c1]
        else:
            result += c + c1  # debug
    return result


def pronunciation_encode(s):
    """Encode pronunciation upcase symbols into IPA symbols.

    Every occurrence of upcase[i] in s is replaced by upcase_pron[i], in
    table order. zip() stops at the shorter table, so the 63-entry small
    upcase table no longer causes an IndexError.
    """
    for plain, ipa in zip(upcase, upcase_pron):
        s = s.replace(plain, ipa)
    return s


# Internal Lingea tags, applied in this order.
# NOTE(review): in the revision this was rebuilt from, every per-letter
# pattern was the empty string (which makes the '\1' substitutions below raise
# re.error). They are reconstructed here on the shape of the one pattern that
# survived, re__ = r'<\^(.*?)>', i.e. r'<X(.*?)>' for tag letter X - confirm
# against an intact copy of the script.
_INTERNAL_TAG_LETTERS = 'acdeEfghiIlLnNopqrtuvwxyz'  # 'g' is the language tag
_INTERNAL_TAG_RES = [(letter, re.compile(r'<%s(.*?)>' % letter))
                     for letter in _INTERNAL_TAG_LETTERS]
re__ = re.compile(r'<\^(.*?)>')


def decode_tag_postprocessing(input):
    """Decode and replace tags used in Lingea dictionaries; decode internal tags.

    For output styles 0 and 1 every tag body is put in parentheses; for
    style 2 the tag body is kept bare, except for the 'd' tag which is still
    parenthesised.
    """
    s = input
    # General information in http://www.david-zbiral.cz/El-slovniky-plnaverze.htm#_Toc151656799
    # TODO: Better output handling
    if OUTSTYLE in (0, 1):
        for _letter, rx in _INTERNAL_TAG_RES:
            s = rx.sub(r'(\1)', s)
        s = re__.sub(r'(\1)', s)
    if OUTSTYLE == 2:
        for letter, rx in _INTERNAL_TAG_RES:
            s = rx.sub(r'(\1)' if letter == 'd' else r'\1', s)
        s = re__.sub(r'\1', s)
    return s


def toBin(b):
    """Prettify debug output format: hex(bin)dec"""
    original = b
    digits = 0     # decimal number whose digits are the bits of b
    weight = 1
    while b > 0:
        if b & 0x01:
            digits += weight
        weight *= 10
        b >>= 1
    return "0x%02X(%08d)%03d" % (original, digits, original)


def outInt(comment=""):
    """Read next byte and output DEBUG info.

    comment may contain one '%s' placeholder, filled with the byte value.
    Advances the global read position by one and returns the byte.
    """
    global bs, pos
    value = bs[pos]
    if DEBUG:
        # Fill the placeholder before printing so the trace shows the value.
        if comment.find('%') != -1:
            comment = comment % value
        print("%03d %s %s | %03d" % (pos, toBin(value), comment, pos))
    pos += 1
    return value


def outStr(comment=""):
    """Read next string and output DEBUG info.

    comment may contain one '%s' placeholder, filled with the decoded text.
    Advances the global read position past the string and returns it with
    every '`' character removed.
    """
    global bs, pos
    s, triple = decode_alpha(bs[pos:])
    s = s.split('\x00')[0]  # give me string until first NULL
    if DEBUG:
        if comment.find('%') != -1:
            comment = comment % s
        print("%03d %s %s | %s" % (pos, toBin(bs[pos]), comment, s))
    pos += triple + 1
    return s.replace('`', '')  # Remove '`' character from words


def _tagged(key, text):
    """Wrap text in the (prefix, suffix) pair of the active tag table."""
    return tag[key][0] + text + tag[key][1]


# The two known 8-byte phrase-block signatures and the number of bytes to
# skip between the two phrase strings that follow each of them.
_PHRASE_BLOCK_SKIP = {
    (0x80, 0x80, 0xF9, 0xDF, 0x9D, 0x00, 0x0B, 0x01): 4,
    (0x80, 0x80, 0xF9, 0xDF, 0x9D, 0x00, 0x23, 0x01): 5,
}

# Record lengths for which the trailing-zero check is skipped.
# hack to workaround bug in some dicts (lg_czen-eco, lg_encz-ind, lg_czgr-eco, lg_grsk-2)
_UNCHECKED_LENGTHS = (13752, 21988, 16204, 12656)


def decode(stream):
    """Decode byte stream of one record, return decoded string with formatting in utf.

    stream -- raw bytes of one record as returned by getRec().

    The result ends with '\n' only when the whole record was consumed and
    every left-over byte was zero; callers use a missing newline as the sign
    of a record-structure decoding error.

    Raises ValueError when a word-class byte is outside 0..31.
    """
    global bs, pos, lastWordClass
    result = ""
    # stream - data byte stream for one record
    bs = unpack("<%sB" % len(stream), stream)
    # bs - list of bytes from stream
    pos = 0
    itemCount = outInt("ItemCount: %s")  # Number of blocks in the record
    mainFlag = outInt("MainFlag: %s")

    # HEADER BLOCK
    # ------------
    if mainFlag & 0x01:
        headerFlag = outInt("HeaderFlag: %s")  # Blocks in header
        if headerFlag & 0x01:
            # Remove character '_' from index
            result += _tagged('rn', outStr("Header record name: %s").replace('_', ''))
        if headerFlag & 0x02:
            result += _tagged('va', outStr("Header variant: %s"))
        if headerFlag & 0x04:
            s = outInt("Header wordclass: %s")
            if s >= 32:
                raise ValueError("Header wordclass out of range in: %s" % result)
            result += _tagged('wc', wordclass[s])
        if headerFlag & 0x08:
            result += _tagged('pa', outStr("Header parts: %s"))
        if headerFlag & 0x10:
            result += _tagged('fo', outStr("Header forms: %s"))
        if headerFlag & 0x20:
            result += _tagged('on', outStr("Header origin note: %s"))
        if headerFlag & 0x80:
            result += _tagged('pr', pronunciation_encode(outStr("Header pronunciation: %s")))

    # Header data block
    if mainFlag & 0x02:
        headerFlag = outInt("Header headerFlag: %s")  # Blocks in header
        if headerFlag & 0x01:
            result += _tagged('hs', outStr("Header source: %s"))
        if headerFlag & 0x02:
            result += _tagged('dv', outStr("Header dataVariant: %s"))
        if headerFlag & 0x08:
            result += _tagged('ex', outStr("Example: %s"))
        if headerFlag & 0x10:
            result += _tagged('sh', outStr("Header shortcut: %s"))
        if headerFlag & 0x40:
            result += _tagged('pv', outStr("Plural variant: %s"))

    # ??? Link elsewhere

    # SOUND DATA REFERENCE (read and discarded)
    if mainFlag & 0x80:
        outInt("Sound reference byte #1: %s")
        outInt("Sound reference byte #2: %s")
        outInt("Sound reference byte #3: %s")
        outInt("Sound reference byte #4: %s")
        if outInt("Sound reference continue: %s") & 0x80:
            outInt("Sound reference byte #5: %s")
            outInt("Sound reference byte #6: %s")
            outInt("Sound reference byte #7: %s")
            outInt("Sound reference byte #8: %s")

    # TODO: Test all mainFlags in header!!!!
    li = 0  # running number of the current numbered item
    # print just every first word class identifier
    # TODO: this is not systematic (should be handled by output)
    lastWordClass = 0

    # DATA BLOCK(S)
    # -------------
    for i in range(0, itemCount):
        item = tag['db'][0] + tag['db'][1]
        ol = False  # True when this item is emitted as a numbered list entry
        dataFlag = outInt("DataFlag #%i: %%s -----------------------------" % i)

        if dataFlag & 0x01:  # small index
            sampleFlag = outInt("Data sampleFlag: %s")
            if sampleFlag & 0x01:
                result += _tagged('sa', outStr("Data sample: %s"))
            if sampleFlag & 0x02:
                result += _tagged('sa', outStr("Data sample variant: %s"))
            if sampleFlag & 0x04:
                s = outInt("Data wordclass: %s")
                if s != lastWordClass:
                    if s >= 32:
                        raise ValueError("Header wordclass out of range in: %s" % result)
                    result += _tagged('wc', wordclass[s])
                lastWordClass = s
            if sampleFlag & 0x08:
                result += _tagged('sw', outStr("Data sample wordclass: %s"))
            if sampleFlag & 0x10:
                outInt("Data sample Int: %s")
                outInt("Data sample Int: %s")
                outInt("Data sample Int: %s")
            if sampleFlag & 0x20:
                item += _tagged('do', outStr("Data origin note: %s"))
            if sampleFlag & 0x80:
                item += " "
                result += _tagged('pr', pronunciation_encode(outStr("Data sample pronunciation: %s")))

        if dataFlag & 0x02:
            item += " "
            subFlag = outInt("Data subFlag: %s")
            if subFlag & 0x08:
                item += _tagged('du', outStr("Data sub example: %s"))
            if subFlag & 0x10:
                item += _tagged('dc', outStr("Data sub shortcut: %s"))
            if subFlag & 0x80:
                # It seems that data sub prefix content is ignored and there
                # is a generated number for the whole block instead.
                outStr("Data sub prefix: %s")
                li += 1
                ol = True

        if dataFlag & 0x04:  # chart
            pass  # ???

        if dataFlag & 0x08:  # reference
            item += _tagged('df', outStr("Data definition: %s"))

        if dataFlag & 0x10:  # note???
            noteFlag = outInt("Data noteFlag: %s")
            if noteFlag & 0x01:
                item += _tagged('nt', outStr("Data note 0x01: %s"))
            if noteFlag & 0x02:
                noteCount = outInt("Data noteCount: %s")
                for _ in range(0, noteCount):
                    item += _tagged('nt', outStr("Data note 0x02: %s"))
            if noteFlag & 0x08:
                noteCount = outInt("Data noteCount: %s")
                for _ in range(0, noteCount):
                    item += _tagged('nt', outStr("Data note 0x08: %s"))
            if noteFlag & 0x40:
                item += _tagged('nt', outStr("Data note 0x40: %s"))

        if dataFlag & 0x20:  # phrase
            phraseFlag1 = outInt("Data phraseFlag1: %s")
            if phraseFlag1 & 0x01:
                item += _tagged('ps', outStr("Data phrase short form: %s"))
            if phraseFlag1 & 0x02:
                phraseCount = outInt("Data phraseCount: %s")
                for _ in range(0, phraseCount):
                    phraseComment = outInt("Data phrase prefix")
                    if phraseComment & 0x04:
                        item += _tagged('pc', outStr("Data phrase comment: %s"))
                    item += _tagged('p1', outStr("Data phrase 1: %s"))
                    item += _tagged('p2', outStr("Data phrase 2: %s"))
            if phraseFlag1 & 0x04:
                phraseCount = outInt("Data phraseCount: %s")
                for _ in range(0, phraseCount):
                    phraseComment = outInt("Data phrase prefix")
                    if phraseComment & 0x04:
                        item += _tagged('pc', outStr("Data phrase 1: %s"))
                    item += _tagged('pg', outStr("Data phrase comment: %s"))
                    item += _tagged('p2', outStr("Data phrase 2: %s"))
            if phraseFlag1 & 0x08:
                phraseCount = outInt("Data simple phraseCount: %s")
                for _ in range(0, phraseCount):
                    item += _tagged('sp', outStr("Data simple phrase: %s"))
            if phraseFlag1 & 0x10:
                if smallIndex:  # different behaviour in small and big dictionaries
                    item += _tagged('ps', outStr("Data phrase short form: %s"))
                else:
                    phraseCount = outInt("Data phraseCount: %s")
                    for _ in range(0, phraseCount):
                        item += _tagged('ps', outStr("Data phrase short form: %s"))
            if phraseFlag1 & 0x40:
                item += _tagged('ps', outStr("Data phrase short form: %s"))
            # TODO: be careful in changing the rules, to have back compatibility!

        if dataFlag & 0x40:  # reference, related language
            referenceFlag = outInt("Data referenceFlag: %s")
            if referenceFlag & 0x01:
                item += _tagged('rs', outStr("Reference synonym: %s"))
            if referenceFlag & 0x04:  # lg_en-wn
                item += _tagged('rr', outStr("Reference hypernym: %s"))
            if referenceFlag & 0x08:  # lg_en-wn
                item += _tagged('rp', outStr("Reference hyponym: %s"))
            # 0x02 antonym ?

        if dataFlag & 0x80:  # Phrase block
            flags = [outInt("Data phrase block: %s") for _ in range(8)]
            skipCount = _PHRASE_BLOCK_SKIP.get(tuple(flags))
            if skipCount is not None:
                result += "\\nphr: "
                li = 1
                ol = True
                item += _tagged('b1', outStr("Data phrase 1: %s"))
                for _ in range(skipCount):
                    outInt("Data phrase block: %s")
                # 'b2' is the tag declared for the second block phrase.
                item += _tagged('b2', outStr("Data phrase 2: %s"))

        if ol:
            result += "\\n%d. %s" % (li, item)
        else:
            result += item

    # The record is considered well decoded when only zero bytes remain.
    ok = True
    if len(stream) not in _UNCHECKED_LENGTHS:
        while pos < len(stream):
            ok = (outInt() == 0x00) and ok
    if ok:
        result += '\n'
    return decode_tag_postprocessing(result)


################################################################
# MAIN
################################################################

def _parse_command_line(argv):
    """Set the option globals from argv (without the program name).

    Prints usage and exits with status 2 on a bad option, a bad output
    style or a missing file name; exits with status 0 after --help.
    """
    global DEBUG, OUTSTYLE, DEBUGHEADER, DEBUGALL, DEBUGLIMIT, FILENAME, tag
    try:
        opts, args = getopt.getopt(
            argv, "hdo:ral:e:",
            ["help", "debug", "out-style=", "debug-header", "debug-all",
             "debug-limit="])
    except getopt.GetoptError:
        usage()
        print("ERROR: Bad option")
        sys.exit(2)

    for o, a in opts:
        if o in ("-d", "--debug"):
            DEBUG = True
        if o in ("-o", "--out-style"):  # output style
            OUTSTYLE = locale.atoi(a)
            if OUTSTYLE not in TAG_STYLES:
                usage()
                print("ERROR: Output style not specified")
                sys.exit(2)
        if o in ("-r", "--debug-header"):
            # If DEBUG and DEBUGHEADER, then print just all header records
            DEBUGHEADER = True
        if o in ("-a", "--debug-all"):
            # If DEBUG and DEBUGALL then print debug info for all records
            DEBUGALL = True
        if o in ("-h", "--help"):
            usage()
            sys.exit(0)
        if o in ("-l", "--debug-limit"):
            # Number of wrong records for printing to stop during debugging
            DEBUGLIMIT = locale.atoi(a)

    # FILENAME is a first parameter on the command line now
    if len(args) != 1:
        usage()
        print("ERROR: You have to specify .trd file to decode")
        sys.exit(2)
    FILENAME = args[0]
    tag = TAG_STYLES[OUTSTYLE]


def main():
    """Decode the .trd file named on the command line to standard output."""
    global DEBUG, DEBUGLIMIT
    global f, index, entryCount, smallIndex, alpha, upcase

    _parse_command_line(sys.argv[1:])

    f = open(FILENAME, 'rb')
    try:
        # DECODE HEADER OF FILE
        f.read(64)  # 64-byte copyright string, not used
        a = unpack("<16L", f.read(64))
        entryCount = a[4]
        indexBaseCount = a[6]
        indexOffsetCount = a[7]
        indexPos = a[9]
        bodyPos = a[10]
        smallIndex = (a[3] == 2052)

        # Select the translation tables matching the dictionary kind.
        if smallIndex:
            alpha = _ALPHA_SMALL
            upcase = _UPCASE_SMALL
        else:
            alpha = _ALPHA_BIG
            upcase = _UPCASE_BIG
        subs["#UPCASE#"] = upcase

        # DECODE INDEX STRUCTURE OF FILE
        index = []
        f.seek(indexPos)
        bases = unpack("<%sL" % indexBaseCount, f.read(indexBaseCount * 4))
        if smallIndex:  # In small dictionaries every base is used 4-times
            bases = [b for b in bases for _ in range(4)]
        for b in bases:
            offsets = unpack("<64H", f.read(64 * 2))
            for o in offsets:
                if len(index) < indexOffsetCount:
                    index.append(bodyPos + b + o * 4)

        # DECODE RECORDS
        if DEBUG:
            # PRINTOUT DEBUG OF FIRST WRONG RECORDS:
            for i in range(1, entryCount):
                if not DEBUGALL:
                    DEBUG = False
                s = decode(getRec(i))
                if DEBUGHEADER:
                    print(s)
                if DEBUGLIMIT > 0 and not s.endswith('\n'):
                    # Decode the broken record again, this time with tracing.
                    DEBUG = True
                    print("-" * 80)
                    print("%s) at address %s" % (i, toBin(index[i])))
                    print("")
                    s = decode(getRec(i))
                    print(s)
                    DEBUGLIMIT -= 1
                DEBUG = True
        else:
            # DECODE EACH RECORD AND PRINT IT IN FORMAT FOR stardict-editor
            for i in range(1, entryCount):
                s = decode(getRec(i))
                if s.endswith('\n'):
                    sys.stdout.write(s)
                else:
                    print(s)
                    print("!!! RECORD STRUCTURE DECODING ERROR !!!")
                    print("Please run this script in DEBUG mode and repair DATA BLOCK(S) section in function decode()")
                    print("If you succeed with whole dictionary send report (name of the dictionary and source code of script) to slovniky@googlegroups.com")
                    break
    finally:
        f.close()


if __name__ == "__main__":
    main()