# File-type identification driven by a "magic" description file.
#
# NOTE(review): this copy of the module has lost its original line breaks and
# indentation -- the statements below run together on a few very long lines,
# so block nesting has to be inferred from statement order.  In addition,
# some text appears to be missing between "while idx" (inside unescaped())
# and "os.path.getmtime(filename)": the rest of unescaped() and whatever
# binds `filename` / `filename2` are not visible here.  Restore both from a
# pristine copy before relying on this file.
#
# The code is Python 2 (print statement, cPickle, file(), long, sys.maxint,
# xreadlines()).
#
# Module state
#   magic_list      -- None until loaded; afterwards the sequence of MagicTest
#                      records that file_magic() walks (presumably rebound to
#                      a list in the missing text -- confirm).
#   mime_magic_list -- set to None here; not used by the visible code.
#   datatypes       -- None until _prepare_datatypes() runs; afterwards a dict
#                      mapping a type name to an Extraction record.
#   maximum_offset  -- largest offset parsed from the magic file.
#   line_no         -- number of magic-file lines read so far; file_magic()
#                      prints it when loading fails.
#
# _exact_bits(a, b)
#   True when every bit set in b is also set in a; installed as the '&' test.
#
# MagicTest / Extraction
#   Record types created by TupleStructType from a space-separated field
#   list (TupleStructType comes from the TupleStruct module, not shown here).
#
# _prepare_datatypes()
#   Fills `datatypes` once; later calls return immediately.  Keys are
#   unsigned + prefix + datatype (for example 'long', 'ubeshort', 'lestring').
#   Prefix '' / 'be' / 'le' selects the struct byte-order character
#   '=' / '>' / '<'.  'date' and 'long' use struct code 'i', 'short' uses 'h',
#   'byte' uses 'B', 'string' uses 's'.  The 'u' variants use the upper-cased
#   code and are not generated for 'string'.  The stored size comes from
#   struct.calcsize(), or is None for strings.
#   NOTE(review): 'byte' maps to 'B' for both the plain and the 'u' variant.
#
# _get_magic_list()
#   Returns at once when magic_list is already set.  Otherwise it builds the
#   operator table `ops`, calls _prepare_datatypes() and defines two helpers:
#     my_int(ascii)   -- parses an integer literal: a "0x" prefix selects
#                        base 16, any other leading "0" base 8, otherwise
#                        base 10.  A leading '~' (checked after the prefix
#                        has been stripped) and a trailing 'L' are removed.
#                        When int() raises ValueError the text is parsed with
#                        long() and unsigned_to_signed is added.
#                        NOTE(review): `if negate: negate= ~negate` rebinds
#                        the flag rather than `value`, so a '~' prefix does
#                        not appear to change the result -- confirm intent.
#     unescaped(text) -- returns '' for empty text, otherwise splits the text
#                        on backslashes and post-processes the pieces; only
#                        its beginning is visible (see the note at the top).
#   A pickled cache in `filename2` is consulted: the first pickled object is
#   compared with my_version and, when it is >= my_version, the second
#   object becomes magic_list.  (Part of the guarding condition is missing.)
#   Otherwise `filename` is read line by line:
#     * blank lines and lines starting with '#' are skipped;
#     * the line is split into at most three fields; leading '>' characters
#       of the first field give `level`, the remainder is the offset, and a
#       line whose offset my_int() cannot parse is skipped;
#     * the second field is the type, optionally followed by '&' and a mask
#       (the mask defaults to ~0); types missing from `datatypes` are skipped;
#     * for non-string types the third field is split into value and
#       description (lines without a description are skipped); a two-character
#       operator prefix is tried before a one-character one; a value of 'x'
#       is stored as None, any other plain literal is round-tripped through
#       struct pack/unpack with the type's format;
#     * for strings the value is the text up to the first whitespace that is
#       not preceded by a backslash; value and description both pass through
#       unescaped();
#     * each accepted line is appended to magic_list as a MagicTest.
#   Finally my_version and magic_list are pickled to `filename2`.
#   NOTE(review): '^' is mapped to operator.__or__ -- confirm this against the
#   meaning of '^' in the magic file format.  op_and is imported but unused in
#   the visible code.
#   NOTE(review): `if value.startswith('>'): value= '>' + value[4:]` drops the
#   three characters after '>'; presumably it was written for a 4-character
#   HTML-escaped '>' -- confirm against a pristine copy.
#
# file_magic(filename)
#   Opens `filename` in binary mode and makes sure the magic list is loaded
#   (on any exception it prints "At line %d" with line_no and re-raises).
#   It then walks magic_list in order and returns the collected descriptions
#   joined with single spaces, with ' \x08' and ' \\b' sequences removed.
#     * a test deeper than the previous test's level is skipped unless the
#       previous test succeeded;
#     * the walk stops at a level-0 test once success_at_level is truthy;
#     * fixed-size reads are cached in reads_done, keyed by
#       (offset, size, format); a falsy cached value is read again;
#     * string tests read len(magic.value) bytes, or, when the value is '\0',
#       up to 80 bytes cut at the first NUL;
#     * a short read skips the test;
#     * fixed-size tests call comparison(value & mask, magic.value), string
#       tests call comparison(value, magic.value);
#     * a matching description is formatted with `% value` when it contains
#       '%'; a test whose magic.value is None always matches and is always
#       formatted with `% value`.
#   NOTE(review): `f` is not closed anywhere in the visible code.
#   NOTE(review): success_at_level is tested for truthiness, so a success
#   recorded at level 0 is indistinguishable from "no success yet".
#
# Script section (__name__ == "__main__")
#   Overrides __file__ with a hard-coded path, loads the magic list and
#   prints filename + "=" + repr(file_magic(...)) for every entry of a
#   hard-coded directory, ignoring IOError.  The trailing `pprint.pprint`
#   has no visible call arguments; the text ends there.
import os, struct; sys=os.sys from TupleStruct import TupleStructType if sys.hexversion<0x02030000: from tzot.future import enumerate magic_list= None mime_magic_list= None datatypes= None maximum_offset= 0 line_no= 0 def _exact_bits(a, b): return (a&b)==b # define the class to store the tests MagicTest= TupleStructType("level offset testtype mask comparison value description") Extraction= TupleStructType("format size datatype") def _prepare_datatypes(): global datatypes if datatypes is not None: return datatypes= {} for prefix, pack_prefix in ( ('', '='), ('be', '>'), ('le', '<')): for datatype, pack_letter in ( ('date', 'i'), ('long', 'i'), ('short', 'h'), ('byte', 'B'), ('string', 's') ): for unsigned in ('', 'u'): if unsigned: if datatype=='string': continue pack_letter= pack_letter.upper() struct_format= pack_prefix+pack_letter if datatype=='string': size= None else: size= struct.calcsize(struct_format) datatypes[unsigned+prefix+datatype]= Extraction(struct_format, size, datatype) def _get_magic_list(): global magic_list, datatypes, maximum_offset, line_no from cPickle import dump as pckl_dump, load as pckl_load struct_pack= struct.pack; struct_unpack= struct.unpack my_version= (0, 8, 6) # change this whenever changes occur in the algorithm if not magic_list is None: return from operator import __eq__ as op_eq, __lt__ as op_lt, __gt__ as op_gt, \ __and__ as op_and, __ne__ as op_ne, __or__ as op_or, \ __ge__ as op_ge, __le__ as op_le ops= {'=': op_eq, '<': op_lt, '>': op_gt, '!': op_ne, '^': op_or, '&': _exact_bits, '<=': op_le, '>=': op_ge} _prepare_datatypes() unsigned_to_signed= -2*(sys.maxint+1) def my_int(ascii): base= 10; negate= False if len(ascii)>1 and ascii[0]=="0": if ascii[1]=="x": ascii= ascii[2:]; base= 16 else: ascii= ascii[1:]; base= 8 if ascii[0]=='~': negate= True; ascii=ascii[1:] if ascii[-1]=="L": ascii= ascii[:-1] try: value= int(ascii, base) except ValueError: value= int(long(ascii, base)+unsigned_to_signed) if negate: negate= ~negate 
return value def unescaped(text): if not text: return '' lst= text.split('\\') for i in range(1, len(lst)): if not lst[i]: lst[i]= '\\' if lst[i][0].isdigit(): idx=1 while idxos.path.getmtime(filename): # maybe compiled ok fle= file(filename2, "rb") if pckl_load(fle) >= my_version: magic_list= pckl_load(fle) fle.close() return fle= file(filename, "rb") for lne in fle.xreadlines(): line_no+= 1 lne= lne.strip() if not lne or lne.startswith("#"): continue ## print lne fields= lne.split(None, 2) level= 0 while fields[0][level]=='>': level+= 1 try: offset= my_int(fields[0][level:]) except ValueError: ## print >>sys.stderr, "ignore offset:", lne continue # now 'level' contains count of ">" at the start of line # and 'offset' is the byte offset into the file if maximum_offset < offset: maximum_offset= offset ## print lne try: type, mask= fields[1].split('&') except ValueError: type= fields[1]; mask= ~0 if isinstance(mask, str): mask= my_int(mask) try: testtype= datatypes[type] except KeyError: # type not implemented yet ## print >>sys.stderr, "ignore type:", lne continue # now testtype is a tuple of format, size and datatype # 'mask' is read too if testtype.datatype != 'string': # fields2 is a single value and a description try: value, description= fields[2].split(None, 1) except ValueError: # they forgot the description continue if value == "=": # sometimes it's type = value in the magic file try: dummy, value, description= fields[2].split(None, 2) except ValueError: # they forgot the description again continue if value.startswith('>'): value= '>' + value[4:] if value[:2] in ops: comparison= ops[value[:2]] value= my_int(value[2:]) elif len(value)>1 and value[0] in ops: comparison= ops[value[0]] value= my_int(value[1:]) else: comparison= op_eq if value == 'x': # just get the value for output value= None else: value= struct_unpack(testtype.format, struct_pack(testtype.format, my_int(value)))[0] else: # datatype=='string' idx=0 if len(fields) < 3: # no value or decription 
fields.append('* ,') try: while not fields[2][idx].isspace(): if fields[2][idx]=='\\': idx+= 1 idx+=1 except IndexError: idx= len(fields[2]) comparison= op_eq value= fields[2][:idx] if value[:2] in ops: comparison= ops[value[:2]] value= value[2:] elif value[:1] in ops: if value[:1] != "!": comparison= ops[value[:1]] value= value[1:] else: comparison= op_eq value= unescaped(value) description= unescaped(fields[2][idx:].strip()) magic_list.append(MagicTest(level, offset, testtype, mask, comparison, value, description)) fle.close(); del fle fle= file(filename2, "wb") pckl_dump(my_version, fle, True) pckl_dump(magic_list, fle, True) fle.close() def file_magic(filename): global magic_list f= file(filename, "rb") try: _get_magic_list() except: print "At line %d" % line_no raise last_level= 0; last_success= False result= [] reads_done= {} success_at_level= None for magic in magic_list: if magic.level > last_level and not last_success: continue if success_at_level and magic.level==0: break last_success= False value= None if magic.testtype.size: # not string if (magic.offset, magic.testtype.size, magic.testtype.format) in reads_done: value= reads_done[(magic.offset, magic.testtype.size, magic.testtype.format)] else: value= None if not value: f.seek(magic.offset) if magic.testtype.size is None: # string if magic.value == '\0': byte_count= 0 else: byte_count=len(magic.value) unpack_format= str(byte_count) + "s" else: byte_count= magic.testtype.size unpack_format= magic.testtype.format if byte_count: bytes= f.read(byte_count) else: bytes= f.read(80) if '\0' not in bytes: bytes+= '\0' byte_count= bytes.index('\0') bytes= bytes[:byte_count] unpack_format= "%ds" % byte_count # now we have read 'bytes' and byte_count == len(bytes) if len(bytes) != byte_count: continue value= struct.unpack(unpack_format, bytes)[0] if magic.testtype.size: reads_done[(magic.offset, magic.testtype.size, magic.testtype.format)]= value if magic.value is not None: if magic.testtype.size is not None: if 
magic.comparison(value & magic.mask, magic.value): if '%' in magic.description: result.append(magic.description % value) else: result.append(magic.description) if not success_at_level: success_at_level= magic.level last_success= True else: if magic.comparison(value, magic.value): if '%' in magic.description: result.append(magic.description % value) else: result.append(magic.description) if not success_at_level: success_at_level= magic.level last_success= True else: # magic.value is None -> value in file was 'x' result.append(magic.description % value) last_success= True # match always # end of loop last_level= magic.level return ' '.join(result).replace(' \x08', '').replace(' \\b', '') if __name__=="__main__": __file__= "c:/pyth.22/lib/site-packages/tzot/magic.py" _get_magic_list() ## dirname= "d:/temp/download/texts/" dirname= "d:/temp/attachments/" for filename in os.listdir(dirname): try: print filename + "=" + repr(file_magic(dirname + filename)) except IOError: pass import pprint pprint.pprint