diff --git a/compare-db.py b/compare-db.py index 3a80451..06b1d70 100644 --- a/compare-db.py +++ b/compare-db.py @@ -13,74 +13,95 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. + +"""Compare contents of db with output from other version of file(1).""" + +from __future__ import print_function -import os import sys +import mutex from pyfile import * -from pyfile.progressbar import ProgressBar from pyfile.threadpool import * -import mutex -def compare_all_files(file_name = 'file', magdir = 'Magdir', exact = False): - pool = ThreadPool(4) - m = mutex.mutex() - split_patterns(magdir, file_name) - compile_patterns(file_name) - compiled = is_compilation_supported(file_name) +def compare_all_files(file_name='file', magdir='Magdir', exact=False): + """ + Compares all saved file(1) output in db with that of other file(1) version. + + Creates a ThreadPool to do this in parallel. Uses a mutex lock to ensure + that text output is not garbled. + """ + n_threads = 4 + pool = ThreadPool(n_threads) + print_lock = mutex.mutex() + + split_patterns(magdir, file_name) + compile_patterns(file_name) + compiled = is_compilation_supported(file_name) + + entries = get_stored_files("db") + + def store_mimedata(entry): + """For a single db entry, calls file(1) and compares it to db data.""" + metadata = get_full_metadata(entry, file_name, compiled) + stored_metadata = get_stored_metadata(entry) + text = "PASS " + entry + if is_regression(stored_metadata, metadata, exact): + text = "FAIL " + entry + "\n" + \ + get_diff(stored_metadata, metadata, exact) + return text + + def data_print(data): + """Print result for single entry and unlock print lock.""" + print(data) + print_lock.unlock() + + def data_stored(data): + """Call data_print as soon as print lock has been acquired.""" + print_lock.lock(data_print, data) + + for entry in entries: + # Insert tasks into the queue and let them run + pool.queueTask(store_mimedata, args=(entry, ), + callback=data_stored) + + # When all tasks are finished, allow the threads to terminate + pool.joinAll() + print('') + - entries = get_stored_files("db") +def main(): + """Parse arguments, call :py:func:`compare_all_files`.""" + file_name = 'file' + magdir = "Magdir" + exact = False - def store_mimedata(data): - metadata = get_full_metadata(data[0], file_name, compiled) - stored_metadata = get_stored_metadata(data[0]) - text = "PASS " + data[0] - if is_regression(stored_metadata, metadata, exact): - text = "FAIL " + data[0] + "\n" + get_diff(stored_metadata, metadata, exact) - return text + if len(sys.argv) >= 3: + file_name = sys.argv[1] + magdir = sys.argv[2] + elif (len(sys.argv) == 2 and sys.argv[1] == "-h") or len(sys.argv) == 1: + print("Compares files in database with output of current file binary.") + print(sys.argv[0] + " [path_to_magdir_directory] [file_name]") + print(" Default path_to_magdir_directory='Magdir'") + print(" Default file_name='file'") + print("Examples:") + print(" " + sys.argv[0] + " file-5.07;") + print(" " + sys.argv[0] + " file-5.07 file-5.04/magic/Magdir;") + sys.exit(0) - def data_print(data): - print data - m.unlock() + if magdir == "exact": + exact = True + magdir = "Magdir" - def data_stored(data): - m.lock(data_print, data) + if len(sys.argv) == 4 and sys.argv[3] == "exact": + exact = True - for i,entry in enumerate(entries): - # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, (entry, i % 2), data_stored) + file_name = sys.argv[1] + compare_all_files(file_name, magdir, exact) - # When all tasks are finished, allow the threads to terminate - pool.joinAll() - print '' # run this only if started as script from command line if __name__ == '__main__': - file_name = 'file' - magdir = "Magdir" - exact = False - - if len(sys.argv) >= 3: - file_name = sys.argv[1] - magdir = sys.argv[2] - elif (len(sys.argv) == 2 and sys.argv[1] == "-h") or len(sys.argv) == 1: - print "Compares files in database with output of current file binary." - print sys.argv[0] + " [path_to_magdir_directory] [file_name]" - print " Default path_to_magdir_directory='Magdir'" - print " Default file_name='file'" - print "Examples:" - print " " + sys.argv[0] + " file-5.07;" - print " " + sys.argv[0] + " file-5.07 file-5.04/magic/Magdir;" - sys.exit(0) - - if magdir == "exact": - exact = True - magdir = "Magdir" - - if len(sys.argv) == 4 and sys.argv[3] == "exact": - exact = True - - file_name = sys.argv[1] - compare_all_files(file_name, magdir, exact) + main() diff --git a/fast-regression-test.py b/fast-regression-test.py index ad14d1f..d040bae 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -14,87 +14,110 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. + +"""Do a quick comparison of output of file(1) with that saved in db.""" + +from __future__ import print_function -import os import sys import getopt +import mutex from pyfile import * from pyfile.threadpool import * -import mutex - -ret = 0 -def test_all_files(exact = False, binary = "file"): - global ret - ret = 0 +#: return value from test_all_files +#: TODO: make this a nonlocal in py3 +ret = 0 - print_file_info(binary) - m = mutex.mutex() +def test_all_files(exact=False, binary="file"): + """Compare output of given file(1) binary with db for all entries.""" + global ret + ret = 0 + + print_file_info(binary) + + print_lock = mutex.mutex() + + entries = sorted(get_stored_files("db")) + + def store_mimedata(filename): + """Compare file(1) output with db for single entry.""" + metadata = get_simple_metadata(filename, binary) + try: + stored_metadata = get_stored_metadata(filename) + except IOError: + # file not found or corrupt + text = "FAIL " + filename + "\n" + \ + "FAIL could not find stored metadata!\n" + \ + "This can mean that the File failed to generate " + \ + "any output for this file." + else: + text = "PASS " + filename + if is_regression(stored_metadata, metadata, exact): + text = "FAIL " + filename + "\n" + \ + get_diff(stored_metadata, metadata, exact) + return text + + def data_print(data): + """Print given text, set global return value, unlock print lock.""" + print(data) + if data[0] == "F": + global ret + ret = 1 + print_lock.unlock() + + def data_stored(data): + """Acquire print lock and call :py:function:`data_print`.""" + print_lock.lock(data_print, data) + + # create here so program exits if error occurs earlier + n_threads = 1 + pool = ThreadPool(n_threads) + + for entry in entries: + # Insert tasks into the queue and let them run + pool.queueTask(store_mimedata, entry, data_stored) + + # When all tasks are finished, allow the threads to terminate + pool.joinAll() + print('') + return ret - entries = sorted(get_stored_files("db")) - def store_mimedata(filename): - metadata = get_simple_metadata(filename, binary) - try: - stored_metadata = get_stored_metadata(filename) - except IOError: - # file not found or corrupt - text = "FAIL " + filename + "\n" + "FAIL could not find stored metadata!\n\ - This can mean that the File failed to generate any output for this file." - else: - text = "PASS " + filename - if is_regression(stored_metadata, metadata, exact): - text = "FAIL " + filename + "\n" + get_diff(stored_metadata, metadata, exact) - return text +def usage(ecode): + """Print info on how to use this program. Return given code.""" + print("Runs regressions.") + print(sys.argv[0] + " [-e] [-b ]") + print(" Default file_binary='file'") + print("Examples:") + print(" " + sys.argv[0] + " -e -b '../file -m ../../magic/magic.mgc'") + print(" " + sys.argv[0] + " -e") + sys.exit(ecode) - def data_print(data): - print data - if data[0] == "F": - global ret - ret = 1 - m.unlock() - def data_stored(data): - m.lock(data_print, data) +def main(): + """Called when running this as script. Parse args, call test_all_files.""" + exact = False + file_binary = "file" + args = sys.argv[1:] - pool = ThreadPool(4) # create here so program exits if error occurs earlier + optlist, args = getopt.getopt(args, 'b:e') - for entry in entries: - # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, entry, data_stored) + for option, arg in optlist: + if option == '-b': + file_binary = arg + elif option == '-e': + exact = True + else: + usage(1) - # When all tasks are finished, allow the threads to terminate - pool.joinAll() - print '' - return ret + sys.exit(test_all_files(exact, file_binary)) -def usage(ecode): - print "Runs regressions." - print sys.argv[0] + " [-e] [-b ]" - print " Default file_binary='file'" - print "Examples:" - print " " + sys.argv[0] + " -e -b '../file -m ../../magic/magic.mgc'" - print " " + sys.argv[0] + " -e" - sys.exit(ecode) # run this only if started as script from command line if __name__ == '__main__': - exact = False - file_binary = "file" - args = sys.argv[1:] - - optlist, args = getopt.getopt(args, 'b:e') - - for o, a in optlist: - if o == '-b': - file_binary = a - elif o == '-e': - exact = True - else: - usage(1) - - sys.exit(test_all_files(exact, file_binary)) + main() diff --git a/pyfile/db.py b/pyfile/db.py index f9190a1..e9670a9 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -13,129 +13,169 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. + +"""Load file(1) output from db, compare it, store output in db.""" + import os -import sys -import errno -from subprocess import Popen, PIPE import pickle import difflib import mimetypes from cStringIO import StringIO -import re mimetypes.init() + def get_stored_metadata(filename): - f = open (filename + ".pickle", 'r') - p = pickle.load(f) - f.close() - return p + """Retrieve metadata stored for given entry in db.""" + with open(filename + ".pickle", 'r') as file_handle: + return pickle.load(file_handle) + def set_stored_metadata(filename, metadata): - f = open (filename + ".pickle", 'w') - pickle.dump(metadata, f) - f.close() - -def is_regression(m1, m2, exact = False, ratio = 0.7): - if m1['output'] == None or m2['output'] == None: - return True - if m1['output'] != m2['output']: - # previous file didn't detect it, so we hope new output is ok - if not m1['output'].endswith("data\n"): - if exact: - if m1['output'] != m2['output']: - return True - else: - r = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() - if (r < ratio): - #print >> sys.stderr, "Expected:%sGot :%s" % (m2['output'], m1['output']) - return True - - mime = m2['mime'].split(":")[-1].split(";")[0].strip() - old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() - - # if old_mime is empty, then previous version of File didn't know that filetype. - # we will hope that new mime is right. - if old_mime != mime and len(old_mime) != 0: - ext = os.path.splitext(mime)[-1] - # it's not error if new mimetype is correct type for that extension. - if ext in mimetypes.types_map.keys(): - expected = mimetypes.types_map[ext] - if expected == mime: - return True - #else: - #print >> sys.stderr, "Expected:%s" % (expected) - #print >> sys.stderr, "Expected:%s\nGot :%s" % (old_mime, mime) - return True - return False; - -def get_diff(m1, m2, exact = False, ratio = 0.7): - if m1['output'] == None or m2['output'] == None: - return "Output is None, was there error during File execution?" - - text = "" - if m1['output'] != m2['output']: - # previous file didn't detect it, so we hope new output is ok - if not m1['output'].endswith("data\n"): - if exact: - if m1['output'] != m2['output']: - text = "Expected :%sGot :%s" % (m1['output'], m2['output']) - else: - r = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() - if (r < ratio): - text = "Expected :%sGot :%s" % (m1['output'], m2['output']) - - mime = m2['mime'].split(":")[-1].split(";")[0].strip() - old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() - - want_mime_diff = False - - # if old_mime is empty, then previous version of File didn't know that filetype. - # we will hope that new mime is right. - if old_mime != mime and len(old_mime) != 0: - ext = os.path.splitext(mime)[-1] - # it's not error if new mimetype is correct type for that extension. - if ext in mimetypes.types_map.keys(): - expected = mimetypes.types_map[ext] - if expected != mime: - want_mime_diff = True - want_mime_diff = True - if want_mime_diff: - text += "Expected :%sGot :%s" % (m1['mime'], m2['mime']) - - if text != "": - if m1.has_key('pattern') and m2.has_key('pattern') and m1['pattern'] != "" and m2['pattern'] != "": - for line in difflib.unified_diff(StringIO(m1['pattern']).readlines(), StringIO(m2['pattern']).readlines()): - text += line - return text - -def get_stored_files(dir_name, subdir = True, *args): - '''Return a list of file names found in directory 'dir_name' - If 'subdir' is True, recursively access subdirectories under 'dir_name'. - Additional arguments, if any, are file extensions to match filenames. Matched - file names are added to the list. - If there are no additional arguments, all files found in the directory are - added to the list. - Example usage: fileList = dirEntries(r'H:\TEMP', False, 'txt', 'py') - Only files with 'txt' and 'py' extensions will be added to the list. - Example usage: fileList = dirEntries(r'H:\TEMP', True) - All files and all the files in subdirectories under H:\TEMP will be added - to the list. - ''' - fileList = [] - for file in os.listdir(dir_name): - dirfile = os.path.join(dir_name, file) - if os.path.isfile(dirfile): - if not args: - if not dirfile.endswith("pickle") and not dirfile.endswith(".source.txt"): - fileList.append(dirfile) - else: - if os.path.splitext(dirfile)[1][1:] in args: - fileList.append(dirfile) - # recursively access file names in subdirectories - elif os.path.isdir(dirfile) and subdir: - #print "Accessing directory:", dirfile - fileList.extend(get_stored_files(dirfile, subdir, *args)) - return fileList + """Store given metadata for given entry in db.""" + with open(filename + ".pickle", 'w') as file_handle: + pickle.dump(metadata, file_handle) + + +def is_regression(meta1, meta2, exact=False, ratio=0.7): + """ + Determine whether two file(1) outputs for same entry are incompatible. + + Metadata can be obtained from py:func`get_stored_metadata` or + :py:func:`file.get_full_metadata`. + + :param dict meta1: metadata for entry1. + :param dict meta2: metadata for entry2. + :param bool exact: whether output has to match letter for letter (True) or + whether slight changes are allowed. + :param float ratio: Amount of difference required for slightly different + entries to be considered the same: + `0` = all changes allowed; `1` = need perfect match. + :returns: True if there is a (significant) difference between `meta1` + and `meta2`. + + .. todo:: Reduce code duplication with function get_diff + """ + if meta1['output'] is None or meta2['output'] is None: + return True + if meta1['output'] != meta2['output']: + # previous file didn't detect it, so we hope new output is ok + if not meta1['output'].endswith("data\n"): + if exact: + if meta1['output'] != meta2['output']: + return True + else: + match = difflib.SequenceMatcher(None, meta1['output'], + meta2['output']).ratio() + if match < ratio: + # print >> sys.stderr, "Expected:%sGot :%s" \ + # % (meta2['output'], meta1['output']) + return True + + mime = meta2['mime'].split(":")[-1].split(";")[0].strip() + old_mime = meta1['mime'].split(":")[-1].split(";")[0].strip() + + # if old_mime is empty, then previous version of File didn't know that + # filetype. we will hope that new mime is right. + if old_mime and old_mime != mime: + ext = os.path.splitext(mime)[-1] + # it's not error if new mimetype is correct type for that extension. + if ext in mimetypes.types_map.keys(): + expected = mimetypes.types_map[ext] + if expected == mime: + return True + # else: + # print >> sys.stderr, "Expected:%s" % (expected) + # print >> sys.stderr, "Expected:%s\nGot :%s" % (old_mime, mime) + return True + return False + + +def get_diff(meta1, meta2, exact=False, ratio=0.7): + """ + Get textual description about how well file(1) outputs match. + + Like :py:func:`is_regression`, except the output is a description instead + of just a bool. + + .. todo:: Reduce code duplication with function is_regression + """ + if meta1['output'] is None or meta2['output'] is None: + return "Output is None, was there error during File execution?" + + text = "" + if meta1['output'] != meta2['output']: + # previous file didn't detect it, so we hope new output is ok + if not meta1['output'].endswith("data\n"): + if exact: + if meta1['output'] != meta2['output']: + text = "Expected :%sGot :%s" % (meta1['output'], + meta2['output']) + else: + match = difflib.SequenceMatcher(None, meta1['output'], + meta2['output']).ratio() + if match < ratio: + text = "Expected :%sGot :%s" % (meta1['output'], + meta2['output']) + + mime = meta2['mime'].split(":")[-1].split(";")[0].strip() + old_mime = meta1['mime'].split(":")[-1].split(";")[0].strip() + + want_mime_diff = False + + # if old_mime is empty, then previous version of File didn't know that + # filetype. we will hope that new mime is right. + if old_mime and old_mime != mime: + ext = os.path.splitext(mime)[-1] + # it's not error if new mimetype is correct type for that extension. + if ext in mimetypes.types_map.keys(): + expected = mimetypes.types_map[ext] + if expected != mime: + want_mime_diff = True + want_mime_diff = True # TODO: this invalidates lines above + if want_mime_diff: + text += "Expected :%sGot :%s" % (meta1['mime'], meta2['mime']) + + if text != "": + if ('pattern' in meta1) and ('pattern' in meta2) and \ + meta1['pattern'] != "" and meta2['pattern'] != "": + for line in difflib.unified_diff( + StringIO(meta1['pattern']).readlines(), + StringIO(meta2['pattern']).readlines()): + text += line + return text + + +def get_stored_files(dir_name, subdir=True, *args): + r""" + Return a list of file names found in directory 'dir_name'. + + If 'subdir' is True, recursively access subdirectories under 'dir_name'. + Additional arguments, if any, are file extensions to match filenames. + Matched file names are added to the list. + If there are no additional arguments, all files found in the directory are + added to the list. + Example usage: file_list = dirEntries(r'H:\TEMP', False, 'txt', 'py') + Only files with 'txt' and 'py' extensions will be added to the list. + Example usage: file_list = dirEntries(r'H:\TEMP', True) + All files and all the files in subdirectories under H:\TEMP will be added + to the list. + """ + file_list = [] + for file_name in os.listdir(dir_name): + dirfile = os.path.join(dir_name, file_name) + if os.path.isfile(dirfile): + if not args: + if not dirfile.endswith("pickle") and \ + not dirfile.endswith(".source.txt"): + file_list.append(dirfile) + else: + if os.path.splitext(dirfile)[1][1:] in args: + file_list.append(dirfile) + # recursively access file names in subdirectories + elif os.path.isdir(dirfile) and subdir: + # print "Accessing directory:", dirfile + file_list.extend(get_stored_files(dirfile, subdir, *args)) + return file_list diff --git a/pyfile/file.py b/pyfile/file.py index 4ca7d26..bfc909e 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -13,254 +13,358 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. + +"""Wrapper for `file(1)` with additional pattern compilation & search.""" + +from __future__ import print_function import os import sys import errno from subprocess import Popen, PIPE -from progressbar import ProgressBar import hashlib import re +from progressbar import ProgressBar -def print_file_info(file_binary = 'file'): - if not file_binary.startswith("/") and not file_binary.startswith("./") and not file_binary.startswith("../"): - popen = Popen('which ' + file_binary, shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - output_which = pipe.read().strip() - if popen.wait() != 0: - raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_which)) - else: - output_which = file_binary - popen = Popen(file_binary + " --version", shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - output_ver = pipe.read().strip() - if popen.wait() not in (0,1): - raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_ver)) - print 'using file from', output_which - print 'version is', output_ver + +def print_file_info(file_binary='file'): + """`print()` absolute path and version of given `file(1)` binary.""" + if not file_binary.startswith("/") and not file_binary.startswith("./") \ + and not file_binary.startswith("../"): + popen = Popen('which ' + file_binary, shell=True, bufsize=4096, + stdout=PIPE) + pipe = popen.stdout + output_which = pipe.read().strip() + if popen.wait() != 0: + raise ValueError('could not query {0} for its version ({1})!' + .format(file_binary, output_which)) + else: + output_which = file_binary + popen = Popen(file_binary + " --version", shell=True, bufsize=4096, + stdout=PIPE) + pipe = popen.stdout + output_ver = pipe.read().strip() + if popen.wait() not in (0, 1): + raise ValueError('could not query {0} for its version ({1})!' + .format(file_binary, output_ver)) + print('using file from', output_which) + print('version is', output_ver) def mkdir_p(path): - try: - os.makedirs(path) - except OSError as exc: # Python >2.5 - if exc.errno == errno.EEXIST: - pass - else: raise - -def get_file_output(filename, binary = "file"): - popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) - pipe = popen.stdout - output = pipe.read() - output_err = popen.stderr.read() - if popen.wait() != 0: - return "Error while calling file, output: " + str(output) - return output - -def get_file_mime(filename, binary = "file"): - popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) - pipe = popen.stdout - output = pipe.read() - output_err = popen.stderr.read() - if popen.wait() != 0: - return "Error while calling file, output: " + str(output) - return output - -def get_simple_metadata(filename, binary = "file"): - metadata = {} - metadata['output'] = get_file_output(filename, binary) - metadata['mime'] = get_file_mime(filename, binary) - return metadata - -def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", only_name = False): - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - outputdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" - mkdir_p(outputdir) - - files = os.listdir(magdir) - files.sort() - if len(files) == 0: - raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) - prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') - for f in files: - mfile = os.path.join(magdir, f) - if os.path.isdir(mfile): - continue - fd = open(mfile, "r") - buff = "" - in_pattern = False - prog.increment_amount() - print prog, "Splitting patterns", '\r', - sys.stdout.flush() - lines = fd.readlines() - for i,line in enumerate(lines): - if line.strip().startswith("#") or len(line.strip()) == 0: - continue - #print line.strip() - if line.strip()[0].isdigit(): - if in_pattern: - fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") - fd_out.write(buff) - fd_out.close() - in_pattern = False - buff = "" - if only_name: - if not re.match("^[0-9]*(\\s)*name", line.strip()): - continue - in_pattern = True - pattern_id += 1 - buff += "#" + f +"\n" - buff += "# Automatically generated from:\n" - buff += "#" + f + ":" + str(i) + "\n" - buff += line - elif line.strip().startswith(">") or line.strip().startswith("!"): - if in_pattern: - buff += line - elif only_name == False: - print "broken pattern in file '" + f + "':" + str(i) - if in_pattern: - fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") - fd_out.write(buff) - fd_out.close() - fd.close() - return pattern_id - -def split_patterns(magdir = "Magdir", file_name = "file"): - pattern_id = _split_patterns(0, magdir, file_name, True) - _split_patterns(pattern_id, magdir, file_name) - - print '' - -def compile_patterns(file_name = "file", file_binary = "file"): - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" - files = os.listdir(magdir) - if len(files) == 0: - raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) - files.sort(key=lambda x: [int(x)]) - mkdir_p(".mgc_temp") - mkdir_p(".mgc_temp/" + FILE_BINARY_HASH) - mkdir_p(".mgc_temp/" + FILE_BINARY_HASH + "/tmp") - prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') - - for i,f in enumerate(files): - out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ".mgc" - if not os.path.exists(out_file): - fd = open(os.path.join(magdir, f), "r") - buf = fd.read() - fd.close() - x = buf.split("\n")[0][1:len(buf.split("\n")[0])] - tmp = open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + "/tmp/" + x), "a") - tmp.write(buf) - tmp.flush() - tmp.close() - ##tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", "a") - ##tmp.write(buf) - ##tmp.flush() - ##tmp.close() - #os.chdir(".mgc_temp") - #print "cp .mgc_temp/.find-magic.tmp .mgc_temp/.find-magic.tmp." + str(i) + ";" + FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) + ";" - #mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; - - ##os.system("cp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";file -C -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";") - cmd = file_binary + " -C -m .mgc_temp/" + FILE_BINARY_HASH + "/tmp" - ret_code = os.system(cmd) - if ret_code != 0: - raise ValueError('command {0} returned non-zero exit code {1}!'.format(cmd, ret_code)) - if os.path.exists("tmp.mgc"): - ret_code = os.system("mv tmp.mgc " + out_file) - if ret_code != 0: - raise ValueError('moving tmp.mgc to {0} failed with code {1}!'.format(out_file, ret_code)) - #os.chdir("..") - prog.increment_amount() - print prog, "Compiling patterns", '\r', - sys.stdout.flush() - print "" - -def get_full_metadata(infile, file_name = "file", compiled = True, file_binary = "file"): - """ file-output plus binary search to find the relevant line in magic file """ - COMPILED_SUFFIX = ".mgc" - if not compiled: - COMPILED_SUFFIX = "" - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" - FILE_BINARY = file_binary - files = os.listdir(magdir) - files.sort(key=lambda x: [int(x)]) - tlist = [] - mkdir_p(".mgc_temp") - a = 0 - b = len(files) - 1 - i = b - - a_out = "" - b_out = None - - while True: - f = files[i] - cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - #print FILE_BINARY + " " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - last = pipe.read() - if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, last.strip())} - if b_out == None: - b_out = last - # a---------i---------b - # a_out == last \solution here - if last != b_out: - a = i - a_out = last - # a-------------------i-------------------b - # solution here/ last == b_out - else: - b = i - b_out = last - - if i == a + (b - a) / 2: - if b_out != last: - i += 1 - last = b_out - f = files[i] - #if f in PATTERNS: - #PATTERNS.remove(f); - #print i, f - fd = open(os.path.join(magdir, f), "r") - buf = fd.read() - fd.close() - if os.path.exists(os.path.dirname(FILE_BINARY) + "/../magic/magic.mime.mgc"): - cmd = FILE_BINARY + " -bi " + infile + " -m " + os.path.dirname(FILE_BINARY) + "/../magic/magic" - else: - cmd = FILE_BINARY + " -bi " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - mime = pipe.read() - if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, mime.strip())} - tlist.append(last) - index = infile.find('.') - if index == -1: - suffix = "" - else: - suffix = infile[index:] - if last == "data\n" and i == 0: - buf = "" - return {'output':last, 'mime':mime, 'pattern':buf, "suffix":suffix} - else: - i = a + (b - a) / 2 - -def is_compilation_supported(file_name = "file", file_binary = "file"): - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp.0.mgc > /dev/null") != 0: - print '' - print "This file version doesn't support compiled patterns => they won't be used" - return False - else: - print 'Compiled patterns will be used' - print '' - return True + """Wrapper around :py:func:`os.makedirs` that catches EEXIST.""" + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST: + pass + else: + raise + + +def get_file_output(filename, binary="file"): + """Run file(1) binary on given filename, return output.""" + popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, + stdout=PIPE, stderr=PIPE) + pipe = popen.stdout + output = pipe.read() + output_err = popen.stderr.read() + if popen.wait() != 0: + return "Error while calling file, output: " + str(output) + \ + str(output_err) + return output + + +def get_file_mime(filename, binary="file"): + """Run file(1) binary with mime option on given filename, return output.""" + popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, + stdout=PIPE, stderr=PIPE) + pipe = popen.stdout + output = pipe.read() + output_err = popen.stderr.read() + if popen.wait() != 0: + return "Error while calling file, output: " + str(output) + \ + str(output_err) + return output + + +def get_simple_metadata(filename, binary="file"): + """ + Get output of `file` and `file -i` on given filename. + + Calls :py:func:`get_file_output` and :py:func:`get_file_mime` and saves + them in a `dict` as fields `output` and `mime`. + + Quick version of :py:func:`get_full_metadata`. + """ + metadata = {} + metadata['output'] = get_file_output(filename, binary) + metadata['mime'] = get_file_mime(filename, binary) + return metadata + + +def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", + only_name=False): + """ + Actual worker function for :py:func:split_patterns`. + + Creates `output` dir in `.mgc_temp`. Loops over pattern files in `magdir` + and for each pattern found in each file creates an extra file in `output` + dir with just that pattern. + + Output file name are just their pattern_id, starting with id given as arg. + + Arg `file_name` only used for getting dir name through hashing. `file(1)` + is not called here. + + Returns number of pattern files thus created. + """ + file_binary_hash = hashlib.sha224(file_name).hexdigest() + outputdir = ".mgc_temp/" + file_binary_hash + "/output" + mkdir_p(outputdir) + + files = os.listdir(magdir) + files.sort() # TODO: sort like the others? + if not files: + raise ValueError('no files found in Magdir {0}' + .format(os.path.join(os.getcwd(), magdir))) + prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') + for loop_file_name in files: + mfile = os.path.join(magdir, loop_file_name) + if os.path.isdir(mfile): + continue + buff = "" + in_pattern = False + prog.increment_amount() + print(prog, "Splitting patterns", end='\r') + sys.stdout.flush() + with open(mfile, "r") as reader: + lines = reader.readlines() + for line_idx, line in enumerate(lines): + if line.strip().startswith("#") or not line.strip(): + continue + # print(line.strip() + if line.strip()[0].isdigit() or \ + (line.strip()[0] == '-' and line.strip()[1].isdigit()): + # start of next pattern. first write finished pattern to file + if in_pattern: + with open(os.path.join(outputdir, str(pattern_id)), "w") \ + as writer: + writer.write(buff) + in_pattern = False + buff = "" + if only_name: + if not re.match("^[0-9]*(\\s)*name", line.strip()): + continue + in_pattern = True + pattern_id += 1 + buff += "#" + loop_file_name + "\n" + buff += "# Automatically generated from:\n" + buff += "#" + loop_file_name + ":" + str(line_idx) + "\n" + buff += line + elif line.strip().startswith(">") or line.strip().startswith("!"): + if in_pattern: + buff += line + elif not only_name: + print("broken pattern in file '" + loop_file_name + "':" + + str(line_idx)) + if in_pattern: + with open(os.path.join(outputdir, str(pattern_id)), "w") as writer: + writer.write(buff) + return pattern_id + + +def split_patterns(magdir="Magdir", file_name="file"): + """ + Given a dir with magic pattern files, create dir with isolated patterns. + + First create isolated pattern files for patterns with a "name" attribute. + Then create pattern files for all patterns. + """ + pattern_id = _split_patterns(0, magdir, file_name, True) + _split_patterns(pattern_id, magdir, file_name) + + print('') + + +def compile_patterns(file_name="file", file_binary="file"): + """ + Creates increasingly complex magic files. + + Loops over isolated patterns, re-assembles original magic files pattern by + pattern and always re-creates a magic file. Creates files + `.mgc_temp/HASH/.find-magic.tmp.PATTERN-ID.mgc` used by + :py:func:`get_full_metadata`. + + This requires quite some space on disc. + """ + file_binary_hash = hashlib.sha224(file_name).hexdigest() + magdir = ".mgc_temp/" + file_binary_hash + "/output" + files = os.listdir(magdir) + if not files: + raise ValueError('no files found in Magdir {0}' + .format(os.path.join(os.getcwd(), magdir))) + files.sort(key=lambda x: [int(x)]) + mkdir_p(".mgc_temp") + mkdir_p(".mgc_temp/" + file_binary_hash) + mkdir_p(".mgc_temp/" + file_binary_hash + "/tmp") + prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') + + for file_index, loop_file_name in enumerate(files): + out_file = ".mgc_temp/" + file_binary_hash + "/.find-magic.tmp." + \ + str(file_index) + ".mgc" + if not os.path.exists(out_file): + with open(os.path.join(magdir, loop_file_name), "r") as reader: + buf = reader.read() + # read name of original pattern file in magic dir from first line + mfile = buf.split("\n")[0][1:] + + # iteratively re-assemble original pattern file + with open(os.path.join(".mgc_temp/" + file_binary_hash + + "/tmp/" + mfile), "a") as appender: + appender.write(buf) + appender.flush() + # tmp = open(".mgc_temp/" + file_binary_hash + "/.find-magic.tmp", + # "a") + # tmp.write(buf) + # tmp.flush() + # tmp.close() + # os.chdir(".mgc_temp") + # print("cp .mgc_temp/.find-magic.tmp " + + # ".mgc_temp/.find-magic.tmp." + str(file_index) + ";" + + # file_binary + " -C -m .mgc_temp/.find-magic.tmp." + + # str(file_index) + ";") + # mv .find-magic.tmp." + str(file_index) + ".mgc .mgc_temp/; + + # os.system("cp .mgc_temp/" + file_binary_hash + + # "/.find-magic.tmp .mgc_temp/" + file_binary_hash + + # "/.find-magic.tmp." + str(file_index) + ";" + + # "file -C -m .mgc_temp/" + file_binary_hash + + # "/.find-magic.tmp." + str(file_index) + ";") + cmd = file_binary + " -C -m .mgc_temp/" + file_binary_hash + "/tmp" + ret_code = os.system(cmd) + if ret_code != 0: + raise ValueError('command {0} returned non-zero exit code {1}!' + .format(cmd, ret_code)) + if os.path.exists("tmp.mgc"): # TODO: move without forking shell + ret_code = os.system("mv tmp.mgc " + out_file) + if ret_code != 0: + raise ValueError('moving tmp.mgc to {0} failed with code ' + '{1}!'.format(out_file, ret_code)) + # os.chdir("..") + prog.increment_amount() + print(prog, "Compiling patterns", end='\r') + sys.stdout.flush() + print("") + + +def get_full_metadata(infile, file_name="file", compiled=True, + file_binary="file"): + """ + file-output plus binary search to find the relevant line in magic file. + + Run `file(1)` repeatedly with different magic files created in + :py:func`compile_patterns` until the one pattern is identified that defines + the `file(1)` output of the given `infile`. + """ + compiled_suffix = ".mgc" + if not compiled: + compiled_suffix = "" + file_binary_hash = hashlib.sha224(file_name).hexdigest() + magdir = ".mgc_temp/" + file_binary_hash + "/output" + files = os.listdir(magdir) + files.sort(key=lambda x: [int(x)]) + tlist = [] + mkdir_p(".mgc_temp") + + # Divide and conquer: find the relevant pattern + idx_left = 0 # left-most index to consider + idx_rigt = len(files) - 1 # right-most index to consider + idx_curr = idx_rigt # some index in the middle we currently test + + # out_left = "" # ouput at idx_left, unused + out_rigt = None # output at idx_rigt + + while True: + file_curr = files[idx_curr] # file name at idx_curr + cmd = file_binary + " -b " + infile + " -m .mgc_temp/" + \ + file_binary_hash + "/.find-magic.tmp." + str(idx_curr) + \ + compiled_suffix + # print(file_binary + " " + infile + " -m .mgc_temp/" + + # file_binary_hash + "/.find-magic.tmp." + str(idx_curr) + + # compiled_suffix) + popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) + pipe = popen.stdout + out_curr = pipe.read() + if popen.wait() != 0: + return dict(output=None, mime=None, pattern=None, suffix=None, + err=(cmd, out_curr.strip())) + if out_rigt is None: # first iteration, uses complete magic file + out_rigt = out_curr + # idx_left---------idx_curr---------idx_rigt + # out_left == out_curr \solution here + if out_curr != out_rigt: + idx_left = idx_curr + # out_left = out_curr + # idx_left-------------------idx_curr-------------------idx_rigt + # solution here/ out_curr == out_rigt + else: + idx_rigt = idx_curr + out_rigt = out_curr + + # are we done? + if idx_curr == idx_left + (idx_rigt - idx_left) / 2: + # idx_* are so close together that next iteration idx_curr would + # not change --> we are done + if out_rigt != out_curr: + idx_curr += 1 + out_curr = out_rigt + file_curr = files[idx_curr] + # if file_curr in PATTERNS: + # PATTERNS.remove(file_curr); + # print(idx_curr, file_curr) + with open(os.path.join(magdir, file_curr), "r") as reader: + buf = reader.read() + if os.path.exists(os.path.dirname(file_binary) + + "/../magic/magic.mime.mgc"): + cmd = file_binary + " -bi " + infile + " -m " + \ + os.path.dirname(file_binary) + "/../magic/magic" + else: + cmd = file_binary + " -bi " + infile + " -m .mgc_temp/" + \ + file_binary_hash + "/.find-magic.tmp." + str(idx_curr) +\ + compiled_suffix + popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) + pipe = popen.stdout + mime = pipe.read() + if popen.wait() != 0: + return dict(output=None, mime=None, pattern=None, suffix=None, + err=(cmd, mime.strip())) + tlist.append(out_curr) + index = infile.find('.') + if index == -1: + suffix = "" + else: + suffix = infile[index:] + if out_curr == "data\n" and idx_curr == 0: + buf = "" + return dict(output=out_curr, mime=mime, pattern=buf, suffix=suffix) + else: + # continue: set idx_curr to middle between idx_left and idx_rigt + idx_curr = idx_left + (idx_rigt - idx_left) / 2 + +def is_compilation_supported(file_name="file", file_binary="file"): + """Determine whether data from :py:func:`compile_patterns` is available.""" + file_binary_hash = hashlib.sha224(file_name).hexdigest() + if os.system(file_binary + " /bin/sh -m .mgc_temp/" + file_binary_hash + + "/.find-magic.tmp.0.mgc > /dev/null") != 0: + print('') + print("This file version doesn't support compiled patterns " + "=> they won't be used") + return False + print('Compiled patterns will be used') + print('') + return True diff --git a/update-db.py b/update-db.py index eb50dd4..2d3b887 100755 --- a/update-db.py +++ b/update-db.py @@ -1,4 +1,5 @@ #!/usr/bin/env python + # Copyright (C) 2012 Red Hat, Inc. # Authors: Jan Kaluza # @@ -14,96 +15,128 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. + +"""Run file(1) on all entries in db folder and save the output.""" + +from __future__ import print_function import os import sys import getopt from pyfile import * from pyfile.progressbar import ProgressBar -from pyfile.threadpool import * +from pyfile.threadpool import ThreadPool +#: flag for error during :py:func:`update_all_files` +#: TODO: make this a nonlocal in py3 global_error = False -def update_all_files(file_name = 'file', magdir = 'Magdir', file_binary = 'file'): - - print_file_info(file_binary) - - split_patterns(magdir, file_name) - compile_patterns(file_name, file_binary) - compiled = is_compilation_supported(file_name, file_binary) - - entries = get_stored_files("db") - if len(entries) == 0: - raise ValueError('no files in db {0}'.format( os.path.join(os.getcwd(), 'db') )) - prog = ProgressBar(0, len(entries), 50, mode='fixed', char='#') - - def store_mimedata(data): - metadata = get_full_metadata(data[0], file_name, compiled, file_binary) - error = metadata['output'] == None - if not error: - set_stored_metadata(data[0], metadata) - return (data[0], data[1], False) - else: - return (data[0], data[1], metadata['err']) # err=(cmd, output) - - def data_stored(data): - entry, hide, error = data - if error: - global global_error - global_error = True - print 'ERROR for', entry - print 'ERROR running command', error[0] - print 'ERROR produced output', error[1] - return - prog.increment_amount() - if not hide: - print prog, "Updating database", '\r', - sys.stdout.flush() - - pool = ThreadPool(4) # create this here, so program exits if error occurs earlier - for i,entry in enumerate(entries): - # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, (entry, i % 2), data_stored) - if global_error: - print "Error when executing File binary" - break - - # When all tasks are finished, allow the threads to terminate - pool.joinAll() - print '' - return global_error + +def update_all_files(file_name='file', magdir='Magdir', file_binary='file'): + """ + Run file(1) on all entries in db folder, save result in same folder + + Performs: + (1) print a quick info on used file(1) binary + (2) compiles patterns + (3) Run in parallel on each db entry using a ThreadPool: + (3a) get full metadata for file + (3b) save metadata in db + """ + print_file_info(file_binary) + + split_patterns(magdir, file_name) + compile_patterns(file_name, file_binary) + compiled = is_compilation_supported(file_name, file_binary) + + entries = get_stored_files("db") + if not entries: + db_dir = os.path.join(os.getcwd(), 'db') # TODO: not always correct + raise ValueError('no files in db {0}'.format(db_dir)) + prog = ProgressBar(0, len(entries), 50, mode='fixed', char='#') + + def store_mimedata(data): + """Compute file output for single entry, save it.""" + entry, hide = data + metadata = get_full_metadata(entry, file_name, compiled, file_binary) + if metadata['output'] is None: + return (entry, hide, metadata['err']) # err=(cmd, output) + else: + set_stored_metadata(entry, metadata) + return (entry, hide, False) + + def data_stored(data): + """Update progress bar after each entry or print error and set flag.""" + entry, hide, error = data + if error: + global global_error + global_error = True + print('ERROR for', entry) + print('ERROR running command', error[0]) + print('ERROR produced output', error[1]) + return + prog.increment_amount() + if not hide: + print(prog, "Updating database", end='\r') + sys.stdout.flush() + + # create thread pool here, so program exits if error occurs earlier + n_threads = 4 # TODO: probably need this instead of 2 in queueTasks + pool = ThreadPool(n_threads) + for index, entry in enumerate(entries): + # Insert tasks into the queue and let them run + pool.queueTask(store_mimedata, args=(entry, index % 2), + taskCallback=data_stored) + if global_error: + print("Error when executing File binary") + break + + # When all tasks are finished, allow the threads to terminate + pool.joinAll() + print('') + return global_error + def usage(ecode): - print "Updates database." - print sys.argv[0] + " [-v ] [-m ] [-b ]" - print " Default path_to_magdir_directory='Magdir'" - print " Default version_name='file'" - print "Examples:" - print " " + sys.argv[0] + " -v file-5.07;" - print " " + sys.argv[0] + " -v file-5.04-my-version -m file-5.04/magic/Magdir;" - sys.exit(ecode) + """Print usage information and exit with given exit code.""" + print("Updates database.") + print(sys.argv[0] + + " [-v ] [-m ] [-b ]") + print(" Default path_to_magdir_directory='Magdir'") + print(" Default version_name='file'") + print("Examples:") + print(" " + sys.argv[0] + " -v file-5.07;") + print(" " + sys.argv[0] + + " -v file-5.04-my-version -m file-5.04/magic/Magdir;") + sys.exit(ecode) + + +def main(): + """Parse arguments and call :py:func:`update_all_files`.""" + file_name = 'file' + file_binary = "file" + magdir = "Magdir" + args = sys.argv[1:] + + optlist, args = getopt.getopt(args, 'b:hm:v:') + + for option, argument in optlist: + if option == '-b': + file_binary = argument + elif option == '-m': + magdir = argument + elif option == '-h': + usage(0) + elif option == '-v': + file_name = argument + else: + usage(1) + + sys.exit(update_all_files(file_name, magdir, file_binary)) + # run this only if started as script from command line if __name__ == '__main__': - file_name = 'file' - file_binary = "file" - magdir = "Magdir" - args = sys.argv[1:] - - optlist, args = getopt.getopt(args, 'b:hm:v:') - - for o, a in optlist: - if o == '-b': - file_binary = a - elif o == '-m': - magdir = a - elif o == '-h': - usage(0) - elif o == '-v': - file_name = a - else: - usage(1) - - sys.exit(update_all_files(file_name, magdir, file_binary)) + main()