From 3da6242f5167b65d1fcef733358dd676d1349921 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:36:35 +0100 Subject: [PATCH 01/42] Pep8-ify and pylint-ify update-db.py Fix whitespace Shorten long lines Use print() function Add doc strings Replace single-letter variable names Define main() function Add TODOs for more changes --- update-db.py | 190 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 111 insertions(+), 79 deletions(-) diff --git a/update-db.py b/update-db.py index eb50dd4..4961def 100755 --- a/update-db.py +++ b/update-db.py @@ -1,4 +1,5 @@ #!/usr/bin/env python + # Copyright (C) 2012 Red Hat, Inc. # Authors: Jan Kaluza # @@ -14,96 +15,127 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. + +"""Run file(1) on all entries in db folder and save the output.""" + +from __future__ import print_function import os import sys import getopt from pyfile import * from pyfile.progressbar import ProgressBar -from pyfile.threadpool import * +from pyfile.threadpool import ThreadPool +#: flag for error during :py:func:`update_all_files` +#: TODO: make this a nonlocal in py3 global_error = False -def update_all_files(file_name = 'file', magdir = 'Magdir', file_binary = 'file'): - - print_file_info(file_binary) - - split_patterns(magdir, file_name) - compile_patterns(file_name, file_binary) - compiled = is_compilation_supported(file_name, file_binary) - - entries = get_stored_files("db") - if len(entries) == 0: - raise ValueError('no files in db {0}'.format( os.path.join(os.getcwd(), 'db') )) - prog = ProgressBar(0, len(entries), 50, mode='fixed', char='#') - - def store_mimedata(data): - metadata = get_full_metadata(data[0], file_name, compiled, file_binary) - error = metadata['output'] == None - if not error: - set_stored_metadata(data[0], metadata) - return (data[0], data[1], False) - else: - return (data[0], data[1], metadata['err']) # err=(cmd, output) - - def data_stored(data): - entry, hide, error = data - if error: - global global_error - global_error = True - print 'ERROR for', entry - print 'ERROR running command', error[0] - print 'ERROR produced output', error[1] - return - prog.increment_amount() - if not hide: - print prog, "Updating database", '\r', - sys.stdout.flush() - - pool = ThreadPool(4) # create this here, so program exits if error occurs earlier - for i,entry in enumerate(entries): - # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, (entry, i % 2), data_stored) - if global_error: - print "Error when executing File binary" - break - - # When all tasks are finished, allow the threads to terminate - pool.joinAll() - print '' - return global_error + +def update_all_files(file_name='file', magdir='Magdir', file_binary='file'): + """ + Run file(1) on all entries in db folder, save result in same folder + + Performs: + (1) print a quick info on used file(1) binary + (2) compiles patterns + (3) Run in parallel on each db entry using a ThreadPool: + (3a) get full metadata for file + (3b) save metadata in db + """ + print_file_info(file_binary) + + split_patterns(magdir, file_name) + compile_patterns(file_name, file_binary) + compiled = is_compilation_supported(file_name, file_binary) + + entries = get_stored_files("db") + if not entries: + db_dir = os.path.join(os.getcwd(), 'db') # TODO: not always correct + raise ValueError('no files in db {0}'.format(db_dir)) + prog = ProgressBar(0, len(entries), 50, mode='fixed', char='#') + + def store_mimedata(data): + """Compute file output for single entry, save it.""" + entry, hide = data + metadata = get_full_metadata(entry, file_name, compiled, file_binary) + if metadata['output'] is None: + return (entry, hide, metadata['err']) # err=(cmd, output) + else: + set_stored_metadata(entry, metadata) + return (entry, hide, False) + + def data_stored(data): + """Update progress bar after each entry or print error and set flag.""" + entry, hide, error = data + if error: + global global_error + global_error = True + print('ERROR for', entry) + print('ERROR running command', error[0]) + print('ERROR produced output', error[1]) + return + prog.increment_amount() + if not hide: + print(prog, "Updating database", end='\r', flush=True) + + # create thread pool here, so program exits if error occurs earlier + n_threads = 4 # TODO: probably need this instead of 2 in queueTasks + pool = ThreadPool(n_threads) + for index, entry in enumerate(entries): + # Insert tasks into the queue and let them run + pool.queueTask(store_mimedata, args=(entry, index % 2), + callback=data_stored) + if global_error: + print("Error when executing File binary") + break + + # When all tasks are finished, allow the threads to terminate + pool.joinAll() + print('') + return global_error + def usage(ecode): - print "Updates database." - print sys.argv[0] + " [-v ] [-m ] [-b ]" - print " Default path_to_magdir_directory='Magdir'" - print " Default version_name='file'" - print "Examples:" - print " " + sys.argv[0] + " -v file-5.07;" - print " " + sys.argv[0] + " -v file-5.04-my-version -m file-5.04/magic/Magdir;" - sys.exit(ecode) + """Print usage information and exit with given exit code.""" + print("Updates database.") + print(sys.argv[0] + + " [-v ] [-m ] [-b ]") + print(" Default path_to_magdir_directory='Magdir'") + print(" Default version_name='file'") + print("Examples:") + print(" " + sys.argv[0] + " -v file-5.07;") + print(" " + sys.argv[0] + + " -v file-5.04-my-version -m file-5.04/magic/Magdir;") + sys.exit(ecode) + + +def main(): + """Parse arguments and call :py:func:`update_all_files`.""" + file_name = 'file' + file_binary = "file" + magdir = "Magdir" + args = sys.argv[1:] + + optlist, args = getopt.getopt(args, 'b:hm:v:') + + for option, argument in optlist: + if option == '-b': + file_binary = argument + elif option == '-m': + magdir = argument + elif option == '-h': + usage(0) + elif option == '-v': + file_name = argument + else: + usage(1) + + sys.exit(update_all_files(file_name, magdir, file_binary)) + # run this only if started as script from command line if __name__ == '__main__': - file_name = 'file' - file_binary = "file" - magdir = "Magdir" - args = sys.argv[1:] - - optlist, args = getopt.getopt(args, 'b:hm:v:') - - for o, a in optlist: - if o == '-b': - file_binary = a - elif o == '-m': - magdir = a - elif o == '-h': - usage(0) - elif o == '-v': - file_name = a - else: - usage(1) - - sys.exit(update_all_files(file_name, magdir, file_binary)) + main() From c38a7c6bf320c91c2cc034a1766cd52938f58812 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:37:53 +0100 Subject: [PATCH 02/42] pep8-ify, pylint-ify compare-db.py: reindent --- compare-db.py | 102 +++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 50 deletions(-) diff --git a/compare-db.py b/compare-db.py index 3a80451..83f56ea 100644 --- a/compare-db.py +++ b/compare-db.py @@ -23,64 +23,66 @@ from pyfile.threadpool import * import mutex + def compare_all_files(file_name = 'file', magdir = 'Magdir', exact = False): - pool = ThreadPool(4) - m = mutex.mutex() + pool = ThreadPool(4) + m = mutex.mutex() + + split_patterns(magdir, file_name) + compile_patterns(file_name) + compiled = is_compilation_supported(file_name) - split_patterns(magdir, file_name) - compile_patterns(file_name) - compiled = is_compilation_supported(file_name) + entries = get_stored_files("db") - entries = get_stored_files("db") + def store_mimedata(data): + metadata = get_full_metadata(data[0], file_name, compiled) + stored_metadata = get_stored_metadata(data[0]) + text = "PASS " + data[0] + if is_regression(stored_metadata, metadata, exact): + text = "FAIL " + data[0] + "\n" + get_diff(stored_metadata, metadata, exact) + return text - def store_mimedata(data): - metadata = get_full_metadata(data[0], file_name, compiled) - stored_metadata = get_stored_metadata(data[0]) - text = "PASS " + data[0] - if is_regression(stored_metadata, metadata, exact): - text = "FAIL " + data[0] + "\n" + get_diff(stored_metadata, metadata, exact) - return text + def data_print(data): + print data + m.unlock() - def data_print(data): - print data - m.unlock() + def data_stored(data): + m.lock(data_print, data) - def data_stored(data): - m.lock(data_print, data) + for i,entry in enumerate(entries): + # Insert tasks into the queue and let them run + pool.queueTask(store_mimedata, (entry, i % 2), data_stored) - for i,entry in enumerate(entries): - # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, (entry, i % 2), data_stored) + # When all tasks are finished, allow the threads to terminate + pool.joinAll() + print '' - # When all tasks are finished, allow the threads to terminate - pool.joinAll() - print '' # run this only if started as script from command line if __name__ == '__main__': - file_name = 'file' - magdir = "Magdir" - exact = False - - if len(sys.argv) >= 3: - file_name = sys.argv[1] - magdir = sys.argv[2] - elif (len(sys.argv) == 2 and sys.argv[1] == "-h") or len(sys.argv) == 1: - print "Compares files in database with output of current file binary." - print sys.argv[0] + " [path_to_magdir_directory] [file_name]" - print " Default path_to_magdir_directory='Magdir'" - print " Default file_name='file'" - print "Examples:" - print " " + sys.argv[0] + " file-5.07;" - print " " + sys.argv[0] + " file-5.07 file-5.04/magic/Magdir;" - sys.exit(0) - - if magdir == "exact": - exact = True - magdir = "Magdir" - - if len(sys.argv) == 4 and sys.argv[3] == "exact": - exact = True - - file_name = sys.argv[1] - compare_all_files(file_name, magdir, exact) + file_name = 'file' + magdir = "Magdir" + exact = False + + if len(sys.argv) >= 3: + file_name = sys.argv[1] + magdir = sys.argv[2] + elif (len(sys.argv) == 2 and sys.argv[1] == "-h") or len(sys.argv) == 1: + print "Compares files in database with output of current file binary." + print sys.argv[0] + " [path_to_magdir_directory] [file_name]" + print " Default path_to_magdir_directory='Magdir'" + print " Default file_name='file'" + print "Examples:" + print " " + sys.argv[0] + " file-5.07;" + print " " + sys.argv[0] + " file-5.07 file-5.04/magic/Magdir;" + sys.exit(0) + + if magdir == "exact": + exact = True + magdir = "Magdir" + + if len(sys.argv) == 4 and sys.argv[3] == "exact": + exact = True + + file_name = sys.argv[1] + compare_all_files(file_name, magdir, exact) From 6f4e1c27094a6f109db61ad46e80a8f35d81472a Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:41:14 +0100 Subject: [PATCH 03/42] pep8-ify, pylint-ify compare-db.py: break lines, fix spaces --- compare-db.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/compare-db.py b/compare-db.py index 83f56ea..574a0d9 100644 --- a/compare-db.py +++ b/compare-db.py @@ -13,8 +13,8 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. import os import sys @@ -24,7 +24,7 @@ import mutex -def compare_all_files(file_name = 'file', magdir = 'Magdir', exact = False): +def compare_all_files(file_name='file', magdir='Magdir', exact=False): pool = ThreadPool(4) m = mutex.mutex() @@ -39,7 +39,8 @@ def store_mimedata(data): stored_metadata = get_stored_metadata(data[0]) text = "PASS " + data[0] if is_regression(stored_metadata, metadata, exact): - text = "FAIL " + data[0] + "\n" + get_diff(stored_metadata, metadata, exact) + text = "FAIL " + data[0] + "\n" + \ + get_diff(stored_metadata, metadata, exact) return text def data_print(data): @@ -49,7 +50,7 @@ def data_print(data): def data_stored(data): m.lock(data_print, data) - for i,entry in enumerate(entries): + for i, entry in enumerate(entries): # Insert tasks into the queue and let them run pool.queueTask(store_mimedata, (entry, i % 2), data_stored) From f6998982fb17372f3bdddf7dcf931b0bebb36396 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:41:29 +0100 Subject: [PATCH 04/42] pep8-ify, pylint-ify compare-db.py: use print() function --- compare-db.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/compare-db.py b/compare-db.py index 574a0d9..3250d5b 100644 --- a/compare-db.py +++ b/compare-db.py @@ -16,6 +16,8 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +from __future__ import print_function + import os import sys from pyfile import * @@ -44,7 +46,7 @@ def store_mimedata(data): return text def data_print(data): - print data + print(data) m.unlock() def data_stored(data): @@ -56,7 +58,7 @@ def data_stored(data): # When all tasks are finished, allow the threads to terminate pool.joinAll() - print '' + print('') # run this only if started as script from command line @@ -69,13 +71,13 @@ def data_stored(data): file_name = sys.argv[1] magdir = sys.argv[2] elif (len(sys.argv) == 2 and sys.argv[1] == "-h") or len(sys.argv) == 1: - print "Compares files in database with output of current file binary." - print sys.argv[0] + " [path_to_magdir_directory] [file_name]" - print " Default path_to_magdir_directory='Magdir'" - print " Default file_name='file'" - print "Examples:" - print " " + sys.argv[0] + " file-5.07;" - print " " + sys.argv[0] + " file-5.07 file-5.04/magic/Magdir;" + print("Compares files in database with output of current file binary.") + print(sys.argv[0] + " [path_to_magdir_directory] [file_name]") + print(" Default path_to_magdir_directory='Magdir'") + print(" Default file_name='file'") + print("Examples:") + print(" " + sys.argv[0] + " file-5.07;") + print(" " + sys.argv[0] + " file-5.07 file-5.04/magic/Magdir;") sys.exit(0) if magdir == "exact": From deedb9e4f368df2b122fb1efed5daadad40787ff Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:42:45 +0100 Subject: [PATCH 05/42] pep8-ify, pylint-ify compare-db.py: create main() function --- compare-db.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/compare-db.py b/compare-db.py index 3250d5b..dc5067e 100644 --- a/compare-db.py +++ b/compare-db.py @@ -61,8 +61,8 @@ def data_stored(data): print('') -# run this only if started as script from command line -if __name__ == '__main__': +def main(): + """Parse arguments, call :py:func:`compare_all_files`.""" file_name = 'file' magdir = "Magdir" exact = False @@ -89,3 +89,8 @@ def data_stored(data): file_name = sys.argv[1] compare_all_files(file_name, magdir, exact) + + +# run this only if started as script from command line +if __name__ == '__main__': + main() From 603e16a10a73c59864f678ccc4b1bf2789e82cf8 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:52:52 +0100 Subject: [PATCH 06/42] pep8-ify, pylint-ify compare-db.py: add doc strings --- compare-db.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/compare-db.py b/compare-db.py index dc5067e..faee364 100644 --- a/compare-db.py +++ b/compare-db.py @@ -16,6 +16,8 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +"""Compare contents of db with output from other version of file(1).""" + from __future__ import print_function import os @@ -27,6 +29,12 @@ def compare_all_files(file_name='file', magdir='Magdir', exact=False): + """ + Compares all saved file(1) output in db with that of other file(1) version. + + Creates a ThreadPool to do this in parallel. Uses a mutex lock to ensure + that text output is not garbled. + """ pool = ThreadPool(4) m = mutex.mutex() @@ -37,6 +45,7 @@ def compare_all_files(file_name='file', magdir='Magdir', exact=False): entries = get_stored_files("db") def store_mimedata(data): + """For a single db entry, calls file(1) and compares it to db data.""" metadata = get_full_metadata(data[0], file_name, compiled) stored_metadata = get_stored_metadata(data[0]) text = "PASS " + data[0] @@ -46,10 +55,12 @@ def store_mimedata(data): return text def data_print(data): + """Print result for single entry and unlock print lock.""" print(data) m.unlock() def data_stored(data): + """Call data_print as soon as print lock has been acquired.""" m.lock(data_print, data) for i, entry in enumerate(entries): From 8951ed3c9229b5b9cb4630e77819cace8f9e1d83 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:53:18 +0100 Subject: [PATCH 07/42] pep8-ify, pylint-ify compare-db.py: replace single-letter vars --- compare-db.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/compare-db.py b/compare-db.py index faee364..6941ff2 100644 --- a/compare-db.py +++ b/compare-db.py @@ -35,8 +35,10 @@ def compare_all_files(file_name='file', magdir='Magdir', exact=False): Creates a ThreadPool to do this in parallel. Uses a mutex lock to ensure that text output is not garbled. """ - pool = ThreadPool(4) - m = mutex.mutex() + n_threads = 4 + + pool = ThreadPool(n_threads) + print_lock = mutex.mutex() split_patterns(magdir, file_name) compile_patterns(file_name) @@ -46,26 +48,28 @@ def compare_all_files(file_name='file', magdir='Magdir', exact=False): def store_mimedata(data): """For a single db entry, calls file(1) and compares it to db data.""" - metadata = get_full_metadata(data[0], file_name, compiled) - stored_metadata = get_stored_metadata(data[0]) - text = "PASS " + data[0] + entry, _ = data + metadata = get_full_metadata(entry, file_name, compiled) + stored_metadata = get_stored_metadata(entry) + text = "PASS " + entry if is_regression(stored_metadata, metadata, exact): - text = "FAIL " + data[0] + "\n" + \ + text = "FAIL " + entry + "\n" + \ get_diff(stored_metadata, metadata, exact) return text def data_print(data): """Print result for single entry and unlock print lock.""" print(data) - m.unlock() + print_lock.unlock() def data_stored(data): """Call data_print as soon as print lock has been acquired.""" - m.lock(data_print, data) + print_lock.lock(data_print, data) - for i, entry in enumerate(entries): + for index, entry in enumerate(entries): # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, (entry, i % 2), data_stored) + pool.queueTask(store_mimedata, args=(entry, index % 2), + callback=data_stored) # When all tasks are finished, allow the threads to terminate pool.joinAll() From 4da20105c42b13192b8cf84b682c8d7a8b523898 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:57:08 +0100 Subject: [PATCH 08/42] pep8-ify, pylint-ify compare-db.py: remove unused --- compare-db.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/compare-db.py b/compare-db.py index 6941ff2..2d72fbc 100644 --- a/compare-db.py +++ b/compare-db.py @@ -20,10 +20,8 @@ from __future__ import print_function -import os import sys from pyfile import * -from pyfile.progressbar import ProgressBar from pyfile.threadpool import * import mutex @@ -36,7 +34,6 @@ def compare_all_files(file_name='file', magdir='Magdir', exact=False): that text output is not garbled. """ n_threads = 4 - pool = ThreadPool(n_threads) print_lock = mutex.mutex() @@ -46,9 +43,8 @@ def compare_all_files(file_name='file', magdir='Magdir', exact=False): entries = get_stored_files("db") - def store_mimedata(data): + def store_mimedata(entry): """For a single db entry, calls file(1) and compares it to db data.""" - entry, _ = data metadata = get_full_metadata(entry, file_name, compiled) stored_metadata = get_stored_metadata(entry) text = "PASS " + entry @@ -66,9 +62,9 @@ def data_stored(data): """Call data_print as soon as print lock has been acquired.""" print_lock.lock(data_print, data) - for index, entry in enumerate(entries): + for entry in entries: # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, args=(entry, index % 2), + pool.queueTask(store_mimedata, args=(entry, ), callback=data_stored) # When all tasks are finished, allow the threads to terminate From cb983d224090f02bc88f643b1d6a2c0b8580cbf6 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:35:08 +0100 Subject: [PATCH 09/42] pep8-ify, pylint-ify compare-db.py: Miscellaneous Order of imports --- compare-db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compare-db.py b/compare-db.py index 2d72fbc..06b1d70 100644 --- a/compare-db.py +++ b/compare-db.py @@ -21,9 +21,9 @@ from __future__ import print_function import sys +import mutex from pyfile import * from pyfile.threadpool import * -import mutex def compare_all_files(file_name='file', magdir='Magdir', exact=False): From 0b001579250c3af6d6bb524ac8555cfe3ca7dc68 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:35:49 +0100 Subject: [PATCH 10/42] pep8-ify, pylint-ify fast-regression-test: re-indent --- fast-regression-test.py | 124 ++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/fast-regression-test.py b/fast-regression-test.py index ad14d1f..4ed4778 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -28,73 +28,73 @@ def test_all_files(exact = False, binary = "file"): - global ret - ret = 0 - - print_file_info(binary) - - m = mutex.mutex() - - entries = sorted(get_stored_files("db")) - - def store_mimedata(filename): - metadata = get_simple_metadata(filename, binary) - try: - stored_metadata = get_stored_metadata(filename) - except IOError: - # file not found or corrupt - text = "FAIL " + filename + "\n" + "FAIL could not find stored metadata!\n\ - This can mean that the File failed to generate any output for this file." - else: - text = "PASS " + filename - if is_regression(stored_metadata, metadata, exact): - text = "FAIL " + filename + "\n" + get_diff(stored_metadata, metadata, exact) - return text - - def data_print(data): - print data - if data[0] == "F": - global ret - ret = 1 - m.unlock() - - def data_stored(data): - m.lock(data_print, data) - - pool = ThreadPool(4) # create here so program exits if error occurs earlier - - for entry in entries: - # Insert tasks into the queue and let them run - pool.queueTask(store_mimedata, entry, data_stored) - - # When all tasks are finished, allow the threads to terminate - pool.joinAll() - print '' - return ret + global ret + ret = 0 + + print_file_info(binary) + + m = mutex.mutex() + + entries = sorted(get_stored_files("db")) + + def store_mimedata(filename): + metadata = get_simple_metadata(filename, binary) + try: + stored_metadata = get_stored_metadata(filename) + except IOError: + # file not found or corrupt + text = "FAIL " + filename + "\n" + "FAIL could not find stored metadata!\n\ +This can mean that the File failed to generate any output for this file." + else: + text = "PASS " + filename + if is_regression(stored_metadata, metadata, exact): + text = "FAIL " + filename + "\n" + get_diff(stored_metadata, metadata, exact) + return text + + def data_print(data): + print data + if data[0] == "F": + global ret + ret = 1 + m.unlock() + + def data_stored(data): + m.lock(data_print, data) + + pool = ThreadPool(1) # create here so program exits if error occurs earlier + + for entry in entries: + # Insert tasks into the queue and let them run + pool.queueTask(store_mimedata, entry, data_stored) + + # When all tasks are finished, allow the threads to terminate + pool.joinAll() + print '' + return ret def usage(ecode): - print "Runs regressions." - print sys.argv[0] + " [-e] [-b ]" - print " Default file_binary='file'" - print "Examples:" - print " " + sys.argv[0] + " -e -b '../file -m ../../magic/magic.mgc'" - print " " + sys.argv[0] + " -e" - sys.exit(ecode) + print "Runs regressions." + print sys.argv[0] + " [-e] [-b ]" + print " Default file_binary='file'" + print "Examples:" + print " " + sys.argv[0] + " -e -b '../file -m ../../magic/magic.mgc'" + print " " + sys.argv[0] + " -e" + sys.exit(ecode) # run this only if started as script from command line if __name__ == '__main__': - exact = False - file_binary = "file" - args = sys.argv[1:] + exact = False + file_binary = "file" + args = sys.argv[1:] - optlist, args = getopt.getopt(args, 'b:e') + optlist, args = getopt.getopt(args, 'b:e') - for o, a in optlist: - if o == '-b': - file_binary = a - elif o == '-e': - exact = True - else: - usage(1) + for o, a in optlist: + if o == '-b': + file_binary = a + elif o == '-e': + exact = True + else: + usage(1) - sys.exit(test_all_files(exact, file_binary)) + sys.exit(test_all_files(exact, file_binary)) From b024d354c58aa01e669f9244331b9a5079fdc8d8 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:42:28 +0100 Subject: [PATCH 11/42] pep8-ify, pylint-ify fast-regression-test: Wrap long lines --- fast-regression-test.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fast-regression-test.py b/fast-regression-test.py index 4ed4778..e71d647 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -14,8 +14,8 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. import os import sys @@ -43,12 +43,15 @@ def store_mimedata(filename): stored_metadata = get_stored_metadata(filename) except IOError: # file not found or corrupt - text = "FAIL " + filename + "\n" + "FAIL could not find stored metadata!\n\ -This can mean that the File failed to generate any output for this file." + text = "FAIL " + filename + "\n" + \ + "FAIL could not find stored metadata!\n" + \ + "This can mean that the File failed to generate " + \ + "any output for this file." else: text = "PASS " + filename if is_regression(stored_metadata, metadata, exact): - text = "FAIL " + filename + "\n" + get_diff(stored_metadata, metadata, exact) + text = "FAIL " + filename + "\n" + \ + get_diff(stored_metadata, metadata, exact) return text def data_print(data): @@ -61,7 +64,9 @@ def data_print(data): def data_stored(data): m.lock(data_print, data) - pool = ThreadPool(1) # create here so program exits if error occurs earlier + # create here so program exits if error occurs earlier + n_threads = 1 + pool = ThreadPool(n_threads) for entry in entries: # Insert tasks into the queue and let them run From bad6b913bfd5bb3b516ad0089cc5ff521b5fd0f3 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:43:33 +0100 Subject: [PATCH 12/42] pep8-ify, pylint-ify fast-regression-test: Spaces --- fast-regression-test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fast-regression-test.py b/fast-regression-test.py index e71d647..9749b82 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -24,10 +24,11 @@ from pyfile.threadpool import * import mutex + ret = 0 -def test_all_files(exact = False, binary = "file"): +def test_all_files(exact=False, binary="file"): global ret ret = 0 @@ -77,6 +78,7 @@ def data_stored(data): print '' return ret + def usage(ecode): print "Runs regressions." print sys.argv[0] + " [-e] [-b ]" From d8b930c31f756de138ae63248e3535d2a305c97a Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:43:46 +0100 Subject: [PATCH 13/42] pep8-ify, pylint-ify fast-regression-test: main function --- fast-regression-test.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fast-regression-test.py b/fast-regression-test.py index 9749b82..9dfc06c 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -88,8 +88,8 @@ def usage(ecode): print " " + sys.argv[0] + " -e" sys.exit(ecode) -# run this only if started as script from command line -if __name__ == '__main__': + +def main(): exact = False file_binary = "file" args = sys.argv[1:] @@ -105,3 +105,8 @@ def usage(ecode): usage(1) sys.exit(test_all_files(exact, file_binary)) + + +# run this only if started as script from command line +if __name__ == '__main__': + main() From d0934e13f82c3365cc3eaa7871a19185ffc12578 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:44:48 +0100 Subject: [PATCH 14/42] pep8-ify, pylint-ify fast-regression-test: Rename single-char var --- fast-regression-test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fast-regression-test.py b/fast-regression-test.py index 9dfc06c..c062ddb 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -34,7 +34,7 @@ def test_all_files(exact=False, binary="file"): print_file_info(binary) - m = mutex.mutex() + print_lock = mutex.mutex() entries = sorted(get_stored_files("db")) @@ -60,10 +60,10 @@ def data_print(data): if data[0] == "F": global ret ret = 1 - m.unlock() + print_lock.unlock() def data_stored(data): - m.lock(data_print, data) + print_lock.lock(data_print, data) # create here so program exits if error occurs earlier n_threads = 1 @@ -96,10 +96,10 @@ def main(): optlist, args = getopt.getopt(args, 'b:e') - for o, a in optlist: - if o == '-b': - file_binary = a - elif o == '-e': + for option, arg in optlist: + if option == '-b': + file_binary = arg + elif option == '-e': exact = True else: usage(1) From 142149363d87147cb9de14c7533f667ba7d72b53 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:49:13 +0100 Subject: [PATCH 15/42] pep8-ify, pylint-ify fast-regression-test: print function --- fast-regression-test.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fast-regression-test.py b/fast-regression-test.py index c062ddb..dc02624 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -17,6 +17,8 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +from __future__ import print_function + import os import sys import getopt @@ -56,7 +58,7 @@ def store_mimedata(filename): return text def data_print(data): - print data + print(data) if data[0] == "F": global ret ret = 1 @@ -75,17 +77,17 @@ def data_stored(data): # When all tasks are finished, allow the threads to terminate pool.joinAll() - print '' + print('') return ret def usage(ecode): - print "Runs regressions." - print sys.argv[0] + " [-e] [-b ]" - print " Default file_binary='file'" - print "Examples:" - print " " + sys.argv[0] + " -e -b '../file -m ../../magic/magic.mgc'" - print " " + sys.argv[0] + " -e" + print("Runs regressions.") + print(sys.argv[0] + " [-e] [-b ]") + print(" Default file_binary='file'") + print("Examples:") + print(" " + sys.argv[0] + " -e -b '../file -m ../../magic/magic.mgc'") + print(" " + sys.argv[0] + " -e") sys.exit(ecode) From 13ad9e6fa70cd00df07d88410b5f62702afed422 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:49:37 +0100 Subject: [PATCH 16/42] pep8-ify, pylint-ify fast-regression-test: Fix imports --- fast-regression-test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fast-regression-test.py b/fast-regression-test.py index dc02624..1a351e9 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -19,12 +19,11 @@ from __future__ import print_function -import os import sys import getopt +import mutex from pyfile import * from pyfile.threadpool import * -import mutex ret = 0 From aab4bc0c4c9f024f457dddff435d3f512bb9f8b8 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 17:54:34 +0100 Subject: [PATCH 17/42] pep8-ify, pylint-ify fast-regression-test: Create doc --- fast-regression-test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fast-regression-test.py b/fast-regression-test.py index 1a351e9..d040bae 100755 --- a/fast-regression-test.py +++ b/fast-regression-test.py @@ -17,6 +17,8 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +"""Do a quick comparison of output of file(1) with that saved in db.""" + from __future__ import print_function import sys @@ -26,10 +28,13 @@ from pyfile.threadpool import * +#: return value from test_all_files +#: TODO: make this a nonlocal in py3 ret = 0 def test_all_files(exact=False, binary="file"): + """Compare output of given file(1) binary with db for all entries.""" global ret ret = 0 @@ -40,6 +45,7 @@ def test_all_files(exact=False, binary="file"): entries = sorted(get_stored_files("db")) def store_mimedata(filename): + """Compare file(1) output with db for single entry.""" metadata = get_simple_metadata(filename, binary) try: stored_metadata = get_stored_metadata(filename) @@ -57,6 +63,7 @@ def store_mimedata(filename): return text def data_print(data): + """Print given text, set global return value, unlock print lock.""" print(data) if data[0] == "F": global ret @@ -64,6 +71,7 @@ def data_print(data): print_lock.unlock() def data_stored(data): + """Acquire print lock and call :py:function:`data_print`.""" print_lock.lock(data_print, data) # create here so program exits if error occurs earlier @@ -81,6 +89,7 @@ def data_stored(data): def usage(ecode): + """Print info on how to use this program. Return given code.""" print("Runs regressions.") print(sys.argv[0] + " [-e] [-b ]") print(" Default file_binary='file'") @@ -91,6 +100,7 @@ def usage(ecode): def main(): + """Called when running this as script. Parse args, call test_all_files.""" exact = False file_binary = "file" args = sys.argv[1:] From 5bffaa2661e13cd6371ee04c26a6731efb0b0e2a Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:57:28 +0100 Subject: [PATCH 18/42] pep8-ify, pylint-ify db.py: Re-indent --- pyfile/db.py | 195 ++++++++++++++++++++++++++------------------------- 1 file changed, 99 insertions(+), 96 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index f9190a1..227e991 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -28,114 +28,117 @@ mimetypes.init() def get_stored_metadata(filename): - f = open (filename + ".pickle", 'r') - p = pickle.load(f) - f.close() - return p + f = open (filename + ".pickle", 'r') + p = pickle.load(f) + f.close() + return p def set_stored_metadata(filename, metadata): - f = open (filename + ".pickle", 'w') - pickle.dump(metadata, f) - f.close() + f = open (filename + ".pickle", 'w') + pickle.dump(metadata, f) + f.close() def is_regression(m1, m2, exact = False, ratio = 0.7): - if m1['output'] == None or m2['output'] == None: - return True - if m1['output'] != m2['output']: - # previous file didn't detect it, so we hope new output is ok - if not m1['output'].endswith("data\n"): - if exact: - if m1['output'] != m2['output']: - return True - else: - r = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() - if (r < ratio): - #print >> sys.stderr, "Expected:%sGot :%s" % (m2['output'], m1['output']) - return True + if m1['output'] == None or m2['output'] == None: + return True + if m1['output'] != m2['output']: + # previous file didn't detect it, so we hope new output is ok + if not m1['output'].endswith("data\n"): + if exact: + if m1['output'] != m2['output']: + return True + else: + r = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() + if (r < ratio): + #print >> sys.stderr, "Expected:%sGot :%s" % (m2['output'], m1['output']) + return True - mime = m2['mime'].split(":")[-1].split(";")[0].strip() - old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() + mime = m2['mime'].split(":")[-1].split(";")[0].strip() + old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() - # if old_mime is empty, then previous version of File didn't know that filetype. - # we will hope that new mime is right. - if old_mime != mime and len(old_mime) != 0: - ext = os.path.splitext(mime)[-1] - # it's not error if new mimetype is correct type for that extension. - if ext in mimetypes.types_map.keys(): - expected = mimetypes.types_map[ext] - if expected == mime: - return True - #else: - #print >> sys.stderr, "Expected:%s" % (expected) - #print >> sys.stderr, "Expected:%s\nGot :%s" % (old_mime, mime) - return True - return False; + # if old_mime is empty, then previous version of File didn't know that filetype. + # we will hope that new mime is right. + if old_mime != mime and len(old_mime) != 0: + ext = os.path.splitext(mime)[-1] + # it's not error if new mimetype is correct type for that extension. + if ext in mimetypes.types_map.keys(): + expected = mimetypes.types_map[ext] + if expected == mime: + return True + #else: + #print >> sys.stderr, "Expected:%s" % (expected) + #print >> sys.stderr, "Expected:%s\nGot :%s" % (old_mime, mime) + return True + return False; def get_diff(m1, m2, exact = False, ratio = 0.7): - if m1['output'] == None or m2['output'] == None: - return "Output is None, was there error during File execution?" + if m1['output'] == None or m2['output'] == None: + return "Output is None, was there error during File execution?" - text = "" - if m1['output'] != m2['output']: - # previous file didn't detect it, so we hope new output is ok - if not m1['output'].endswith("data\n"): - if exact: - if m1['output'] != m2['output']: - text = "Expected :%sGot :%s" % (m1['output'], m2['output']) - else: - r = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() - if (r < ratio): - text = "Expected :%sGot :%s" % (m1['output'], m2['output']) + text = "" + if m1['output'] != m2['output']: + # previous file didn't detect it, so we hope new output is ok + if not m1['output'].endswith("data\n"): + if exact: + if m1['output'] != m2['output']: + text = "Expected :%sGot :%s" % (m1['output'], + m2['output']) + else: + r = difflib.SequenceMatcher(None, m1['output'], + m2['output']).ratio() + if (r < ratio): + text = "Expected :%sGot :%s" % (m1['output'], + m2['output']) - mime = m2['mime'].split(":")[-1].split(";")[0].strip() - old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() + mime = m2['mime'].split(":")[-1].split(";")[0].strip() + old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() - want_mime_diff = False + want_mime_diff = False - # if old_mime is empty, then previous version of File didn't know that filetype. - # we will hope that new mime is right. - if old_mime != mime and len(old_mime) != 0: - ext = os.path.splitext(mime)[-1] - # it's not error if new mimetype is correct type for that extension. - if ext in mimetypes.types_map.keys(): - expected = mimetypes.types_map[ext] - if expected != mime: - want_mime_diff = True - want_mime_diff = True - if want_mime_diff: - text += "Expected :%sGot :%s" % (m1['mime'], m2['mime']) + # if old_mime is empty, then previous version of File didn't know that filetype. + # we will hope that new mime is right. + if old_mime != mime and len(old_mime) != 0: + ext = os.path.splitext(mime)[-1] + # it's not error if new mimetype is correct type for that extension. + if ext in mimetypes.types_map.keys(): + expected = mimetypes.types_map[ext] + if expected != mime: + want_mime_diff = True + want_mime_diff = True + if want_mime_diff: + text += "Expected :%sGot :%s" % (m1['mime'], m2['mime']) - if text != "": - if m1.has_key('pattern') and m2.has_key('pattern') and m1['pattern'] != "" and m2['pattern'] != "": - for line in difflib.unified_diff(StringIO(m1['pattern']).readlines(), StringIO(m2['pattern']).readlines()): - text += line - return text + if text != "": + if m1.has_key('pattern') and m2.has_key('pattern') and m1['pattern'] != "" and m2['pattern'] != "": + for line in difflib.unified_diff(StringIO(m1['pattern']).readlines(), StringIO(m2['pattern']).readlines()): + text += line + return text def get_stored_files(dir_name, subdir = True, *args): - '''Return a list of file names found in directory 'dir_name' - If 'subdir' is True, recursively access subdirectories under 'dir_name'. - Additional arguments, if any, are file extensions to match filenames. Matched - file names are added to the list. - If there are no additional arguments, all files found in the directory are - added to the list. - Example usage: fileList = dirEntries(r'H:\TEMP', False, 'txt', 'py') - Only files with 'txt' and 'py' extensions will be added to the list. - Example usage: fileList = dirEntries(r'H:\TEMP', True) - All files and all the files in subdirectories under H:\TEMP will be added - to the list. - ''' - fileList = [] - for file in os.listdir(dir_name): - dirfile = os.path.join(dir_name, file) - if os.path.isfile(dirfile): - if not args: - if not dirfile.endswith("pickle") and not dirfile.endswith(".source.txt"): - fileList.append(dirfile) - else: - if os.path.splitext(dirfile)[1][1:] in args: - fileList.append(dirfile) - # recursively access file names in subdirectories - elif os.path.isdir(dirfile) and subdir: - #print "Accessing directory:", dirfile - fileList.extend(get_stored_files(dirfile, subdir, *args)) - return fileList + '''Return a list of file names found in directory 'dir_name' + If 'subdir' is True, recursively access subdirectories under 'dir_name'. + Additional arguments, if any, are file extensions to match filenames. Matched + file names are added to the list. + If there are no additional arguments, all files found in the directory are + added to the list. + Example usage: fileList = dirEntries(r'H:\TEMP', False, 'txt', 'py') + Only files with 'txt' and 'py' extensions will be added to the list. + Example usage: fileList = dirEntries(r'H:\TEMP', True) + All files and all the files in subdirectories under H:\TEMP will be added + to the list. + ''' + fileList = [] + for file in os.listdir(dir_name): + dirfile = os.path.join(dir_name, file) + if os.path.isfile(dirfile): + if not args: + if not dirfile.endswith("pickle") and not dirfile.endswith(".source.txt"): + fileList.append(dirfile) + else: + if os.path.splitext(dirfile)[1][1:] in args: + fileList.append(dirfile) + # recursively access file names in subdirectories + elif os.path.isdir(dirfile) and subdir: + #print "Accessing directory:", dirfile + fileList.extend(get_stored_files(dirfile, subdir, *args)) + return fileList From 6a842d754e2c76e7c1dea3b5adeafb90f3eaaae6 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 18:02:36 +0100 Subject: [PATCH 19/42] pep8-ify, pylint-ify db.py: wrap long lines --- pyfile/db.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index 227e991..581b954 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -13,8 +13,9 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. + import os import sys @@ -48,16 +49,18 @@ def is_regression(m1, m2, exact = False, ratio = 0.7): if m1['output'] != m2['output']: return True else: - r = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() + r = difflib.SequenceMatcher(None, m1['output'], + m2['output']).ratio() if (r < ratio): - #print >> sys.stderr, "Expected:%sGot :%s" % (m2['output'], m1['output']) + # print >> sys.stderr, "Expected:%sGot :%s" \ + # % (m2['output'], m1['output']) return True mime = m2['mime'].split(":")[-1].split(";")[0].strip() old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() - # if old_mime is empty, then previous version of File didn't know that filetype. - # we will hope that new mime is right. + # if old_mime is empty, then previous version of File didn't know that + # filetype. we will hope that new mime is right. if old_mime != mime and len(old_mime) != 0: ext = os.path.splitext(mime)[-1] # it's not error if new mimetype is correct type for that extension. @@ -95,8 +98,8 @@ def get_diff(m1, m2, exact = False, ratio = 0.7): want_mime_diff = False - # if old_mime is empty, then previous version of File didn't know that filetype. - # we will hope that new mime is right. + # if old_mime is empty, then previous version of File didn't know that + # filetype. we will hope that new mime is right. if old_mime != mime and len(old_mime) != 0: ext = os.path.splitext(mime)[-1] # it's not error if new mimetype is correct type for that extension. @@ -109,16 +112,19 @@ def get_diff(m1, m2, exact = False, ratio = 0.7): text += "Expected :%sGot :%s" % (m1['mime'], m2['mime']) if text != "": - if m1.has_key('pattern') and m2.has_key('pattern') and m1['pattern'] != "" and m2['pattern'] != "": - for line in difflib.unified_diff(StringIO(m1['pattern']).readlines(), StringIO(m2['pattern']).readlines()): + if m1.has_key('pattern') and m2.has_key('pattern') and \ + m1['pattern'] != "" and m2['pattern'] != "": + for line in \ + difflib.unified_diff(StringIO(m1['pattern']).readlines(), + StringIO(m2['pattern']).readlines()): text += line return text def get_stored_files(dir_name, subdir = True, *args): '''Return a list of file names found in directory 'dir_name' If 'subdir' is True, recursively access subdirectories under 'dir_name'. - Additional arguments, if any, are file extensions to match filenames. Matched - file names are added to the list. + Additional arguments, if any, are file extensions to match filenames. + Matched file names are added to the list. If there are no additional arguments, all files found in the directory are added to the list. Example usage: fileList = dirEntries(r'H:\TEMP', False, 'txt', 'py') @@ -132,7 +138,8 @@ def get_stored_files(dir_name, subdir = True, *args): dirfile = os.path.join(dir_name, file) if os.path.isfile(dirfile): if not args: - if not dirfile.endswith("pickle") and not dirfile.endswith(".source.txt"): + if not dirfile.endswith("pickle") and \ + not dirfile.endswith(".source.txt"): fileList.append(dirfile) else: if os.path.splitext(dirfile)[1][1:] in args: From 45c7a842ab8503b1fd1878bb662c427323e3db95 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 11 Jan 2019 18:03:55 +0100 Subject: [PATCH 20/42] pep8-ify, pylint-ify db.py: whitespace --- pyfile/db.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index 581b954..0db1322 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -28,18 +28,21 @@ import re mimetypes.init() + def get_stored_metadata(filename): - f = open (filename + ".pickle", 'r') + f = open(filename + ".pickle", 'r') p = pickle.load(f) f.close() return p + def set_stored_metadata(filename, metadata): - f = open (filename + ".pickle", 'w') + f = open(filename + ".pickle", 'w') pickle.dump(metadata, f) f.close() -def is_regression(m1, m2, exact = False, ratio = 0.7): + +def is_regression(m1, m2, exact=False, ratio=0.7): if m1['output'] == None or m2['output'] == None: return True if m1['output'] != m2['output']: @@ -68,13 +71,14 @@ def is_regression(m1, m2, exact = False, ratio = 0.7): expected = mimetypes.types_map[ext] if expected == mime: return True - #else: - #print >> sys.stderr, "Expected:%s" % (expected) - #print >> sys.stderr, "Expected:%s\nGot :%s" % (old_mime, mime) + # else: + # print >> sys.stderr, "Expected:%s" % (expected) + # print >> sys.stderr, "Expected:%s\nGot :%s" % (old_mime, mime) return True - return False; + return False -def get_diff(m1, m2, exact = False, ratio = 0.7): + +def get_diff(m1, m2, exact=False, ratio=0.7): if m1['output'] == None or m2['output'] == None: return "Output is None, was there error during File execution?" @@ -120,7 +124,8 @@ def get_diff(m1, m2, exact = False, ratio = 0.7): text += line return text -def get_stored_files(dir_name, subdir = True, *args): + +def get_stored_files(dir_name, subdir=True, *args): '''Return a list of file names found in directory 'dir_name' If 'subdir' is True, recursively access subdirectories under 'dir_name'. Additional arguments, if any, are file extensions to match filenames. @@ -146,6 +151,6 @@ def get_stored_files(dir_name, subdir = True, *args): fileList.append(dirfile) # recursively access file names in subdirectories elif os.path.isdir(dirfile) and subdir: - #print "Accessing directory:", dirfile + # print "Accessing directory:", dirfile fileList.extend(get_stored_files(dirfile, subdir, *args)) return fileList From 96182843043a52c5e83c0f56d59727bee48dec6c Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:17:04 +0100 Subject: [PATCH 21/42] pep8-ify, pylint-ify db.py: use with open() --- pyfile/db.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index 0db1322..ac62998 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -30,16 +30,13 @@ def get_stored_metadata(filename): - f = open(filename + ".pickle", 'r') - p = pickle.load(f) - f.close() - return p + with open(filename + ".pickle", 'r') as file_handle: + return pickle.load(file_handle) def set_stored_metadata(filename, metadata): - f = open(filename + ".pickle", 'w') - pickle.dump(metadata, f) - f.close() + with open(filename + ".pickle", 'w') as file_handle: + pickle.dump(metadata, file_handle) def is_regression(m1, m2, exact=False, ratio=0.7): From 61c7d160158c74cec3cf5c13507e97c56daf4886 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:17:11 +0100 Subject: [PATCH 22/42] pep8-ify, pylint-ify db.py: replace single-letter var --- pyfile/db.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index ac62998..e3bb53b 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -49,9 +49,9 @@ def is_regression(m1, m2, exact=False, ratio=0.7): if m1['output'] != m2['output']: return True else: - r = difflib.SequenceMatcher(None, m1['output'], - m2['output']).ratio() - if (r < ratio): + match = difflib.SequenceMatcher(None, m1['output'], + m2['output']).ratio() + if (match < ratio): # print >> sys.stderr, "Expected:%sGot :%s" \ # % (m2['output'], m1['output']) return True @@ -88,9 +88,9 @@ def get_diff(m1, m2, exact=False, ratio=0.7): text = "Expected :%sGot :%s" % (m1['output'], m2['output']) else: - r = difflib.SequenceMatcher(None, m1['output'], - m2['output']).ratio() - if (r < ratio): + match = difflib.SequenceMatcher(None, m1['output'], + m2['output']).ratio() + if (match < ratio): text = "Expected :%sGot :%s" % (m1['output'], m2['output']) From 117426f61ee1b1fb1e5f01fb7fcf96bff13b45ec Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:46:39 +0100 Subject: [PATCH 23/42] pep8-ify, pylint-ify db.py: Fix error from re-indent --- pyfile/db.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index e3bb53b..4daecd3 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -48,9 +48,9 @@ def is_regression(m1, m2, exact=False, ratio=0.7): if exact: if m1['output'] != m2['output']: return True - else: - match = difflib.SequenceMatcher(None, m1['output'], - m2['output']).ratio() + else: + match = difflib.SequenceMatcher(None, m1['output'], + m2['output']).ratio() if (match < ratio): # print >> sys.stderr, "Expected:%sGot :%s" \ # % (m2['output'], m1['output']) From b592f3b2e196abf8e15d803504f3469b636059d2 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:47:21 +0100 Subject: [PATCH 24/42] pep8-ify, pylint-ify db.py: doc strings --- pyfile/db.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index 4daecd3..9a04f03 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -16,6 +16,8 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +"""Load file(1) output from db, compare it, store output in db.""" + import os import sys @@ -30,16 +32,33 @@ def get_stored_metadata(filename): + """Retrieve metadata stored for given entry in db.""" with open(filename + ".pickle", 'r') as file_handle: return pickle.load(file_handle) def set_stored_metadata(filename, metadata): + """Store given metadata for given entry in db.""" with open(filename + ".pickle", 'w') as file_handle: pickle.dump(metadata, file_handle) def is_regression(m1, m2, exact=False, ratio=0.7): + """ + Determine whether two file(1) outputs for same entry are incompatible. + + Metadata can be obtained from py:func`get_stored_metadata` or + :py:func:`file.get_full_metadata`. + + :param dict m1: metadata for entry1. + :param dict m2: metadata for entry2. + :param bool exact: whether output has to match letter for letter (True) or + whether slight changes are allowed. + :param float ratio: Amount of difference required for slightly different + entries to be considered the same: + `0` = all changes allowed; `1` = need perfect match. + :returns: True if there is a (significant) difference between `m1` and `m2` + """ if m1['output'] == None or m2['output'] == None: return True if m1['output'] != m2['output']: @@ -76,6 +95,12 @@ def is_regression(m1, m2, exact=False, ratio=0.7): def get_diff(m1, m2, exact=False, ratio=0.7): + """ + Get textual description about how well file(1) outputs match. + + Like :py:func:`is_regression`, except the output is a description instead + of just a bool. + """ if m1['output'] == None or m2['output'] == None: return "Output is None, was there error during File execution?" @@ -123,7 +148,9 @@ def get_diff(m1, m2, exact=False, ratio=0.7): def get_stored_files(dir_name, subdir=True, *args): - '''Return a list of file names found in directory 'dir_name' + r""" + Return a list of file names found in directory 'dir_name'. + If 'subdir' is True, recursively access subdirectories under 'dir_name'. Additional arguments, if any, are file extensions to match filenames. Matched file names are added to the list. @@ -134,7 +161,7 @@ def get_stored_files(dir_name, subdir=True, *args): Example usage: fileList = dirEntries(r'H:\TEMP', True) All files and all the files in subdirectories under H:\TEMP will be added to the list. - ''' + """ fileList = [] for file in os.listdir(dir_name): dirfile = os.path.join(dir_name, file) From b2eadc052d8fc17dc3c62205dc9b8de95b6d1e43 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:47:29 +0100 Subject: [PATCH 25/42] pep8-ify, pylint-ify db.py: add todos --- pyfile/db.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyfile/db.py b/pyfile/db.py index 9a04f03..ceccc9d 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -58,6 +58,8 @@ def is_regression(m1, m2, exact=False, ratio=0.7): entries to be considered the same: `0` = all changes allowed; `1` = need perfect match. :returns: True if there is a (significant) difference between `m1` and `m2` + + .. todo:: Reduce code duplication with function get_diff """ if m1['output'] == None or m2['output'] == None: return True @@ -100,6 +102,8 @@ def get_diff(m1, m2, exact=False, ratio=0.7): Like :py:func:`is_regression`, except the output is a description instead of just a bool. + + .. todo:: Reduce code duplication with function is_regression """ if m1['output'] == None or m2['output'] == None: return "Output is None, was there error during File execution?" @@ -133,7 +137,7 @@ def get_diff(m1, m2, exact=False, ratio=0.7): expected = mimetypes.types_map[ext] if expected != mime: want_mime_diff = True - want_mime_diff = True + want_mime_diff = True # TODO: this invalidates lines above if want_mime_diff: text += "Expected :%sGot :%s" % (m1['mime'], m2['mime']) From 913dc9f0d9212d8bc7ec391ba9eaa55254735b6e Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:48:51 +0100 Subject: [PATCH 26/42] pep8-ify, pylint-ify db.py: replace deprecated has_key --- pyfile/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfile/db.py b/pyfile/db.py index ceccc9d..2b8f06f 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -142,7 +142,7 @@ def get_diff(m1, m2, exact=False, ratio=0.7): text += "Expected :%sGot :%s" % (m1['mime'], m2['mime']) if text != "": - if m1.has_key('pattern') and m2.has_key('pattern') and \ + if ('pattern' in m1) and ('pattern' in m2) and \ m1['pattern'] != "" and m2['pattern'] != "": for line in \ difflib.unified_diff(StringIO(m1['pattern']).readlines(), From c3a86a96b96fea090ad0e91c53651ac81f6cd773 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:53:41 +0100 Subject: [PATCH 27/42] pep8-ify, pylint-ify db.py: remove unused imports --- pyfile/db.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index 2b8f06f..9fb9f3a 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -20,14 +20,10 @@ import os -import sys -import errno -from subprocess import Popen, PIPE import pickle import difflib import mimetypes from cStringIO import StringIO -import re mimetypes.init() From b14defcf3b7eeb3ebf9698477bce86032f43e1e0 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 09:58:52 +0100 Subject: [PATCH 28/42] pep8-ify, pylint-ify db.py: miscellaneous --- pyfile/db.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index 9fb9f3a..194e0c7 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -57,7 +57,7 @@ def is_regression(m1, m2, exact=False, ratio=0.7): .. todo:: Reduce code duplication with function get_diff """ - if m1['output'] == None or m2['output'] == None: + if m1['output'] is None or m2['output'] is None: return True if m1['output'] != m2['output']: # previous file didn't detect it, so we hope new output is ok @@ -68,7 +68,7 @@ def is_regression(m1, m2, exact=False, ratio=0.7): else: match = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() - if (match < ratio): + if match < ratio: # print >> sys.stderr, "Expected:%sGot :%s" \ # % (m2['output'], m1['output']) return True @@ -78,7 +78,7 @@ def is_regression(m1, m2, exact=False, ratio=0.7): # if old_mime is empty, then previous version of File didn't know that # filetype. we will hope that new mime is right. - if old_mime != mime and len(old_mime) != 0: + if old_mime and old_mime != mime: ext = os.path.splitext(mime)[-1] # it's not error if new mimetype is correct type for that extension. if ext in mimetypes.types_map.keys(): @@ -101,7 +101,7 @@ def get_diff(m1, m2, exact=False, ratio=0.7): .. todo:: Reduce code duplication with function is_regression """ - if m1['output'] == None or m2['output'] == None: + if m1['output'] is None or m2['output'] is None: return "Output is None, was there error during File execution?" text = "" @@ -115,7 +115,7 @@ def get_diff(m1, m2, exact=False, ratio=0.7): else: match = difflib.SequenceMatcher(None, m1['output'], m2['output']).ratio() - if (match < ratio): + if match < ratio: text = "Expected :%sGot :%s" % (m1['output'], m2['output']) @@ -126,7 +126,7 @@ def get_diff(m1, m2, exact=False, ratio=0.7): # if old_mime is empty, then previous version of File didn't know that # filetype. we will hope that new mime is right. - if old_mime != mime and len(old_mime) != 0: + if old_mime and old_mime != mime: ext = os.path.splitext(mime)[-1] # it's not error if new mimetype is correct type for that extension. if ext in mimetypes.types_map.keys(): From 2bb9c65a57dbb1b8e58ce4e2987ad104a844062d Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 10:02:46 +0100 Subject: [PATCH 29/42] pep8-ify, pylint-ify db.py: invalid var names --- pyfile/db.py | 83 ++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/pyfile/db.py b/pyfile/db.py index 194e0c7..e9670a9 100644 --- a/pyfile/db.py +++ b/pyfile/db.py @@ -39,42 +39,43 @@ def set_stored_metadata(filename, metadata): pickle.dump(metadata, file_handle) -def is_regression(m1, m2, exact=False, ratio=0.7): +def is_regression(meta1, meta2, exact=False, ratio=0.7): """ Determine whether two file(1) outputs for same entry are incompatible. Metadata can be obtained from py:func`get_stored_metadata` or :py:func:`file.get_full_metadata`. - :param dict m1: metadata for entry1. - :param dict m2: metadata for entry2. + :param dict meta1: metadata for entry1. + :param dict meta2: metadata for entry2. :param bool exact: whether output has to match letter for letter (True) or whether slight changes are allowed. :param float ratio: Amount of difference required for slightly different entries to be considered the same: `0` = all changes allowed; `1` = need perfect match. - :returns: True if there is a (significant) difference between `m1` and `m2` + :returns: True if there is a (significant) difference between `meta1` + and `meta2`. .. todo:: Reduce code duplication with function get_diff """ - if m1['output'] is None or m2['output'] is None: + if meta1['output'] is None or meta2['output'] is None: return True - if m1['output'] != m2['output']: + if meta1['output'] != meta2['output']: # previous file didn't detect it, so we hope new output is ok - if not m1['output'].endswith("data\n"): + if not meta1['output'].endswith("data\n"): if exact: - if m1['output'] != m2['output']: + if meta1['output'] != meta2['output']: return True else: - match = difflib.SequenceMatcher(None, m1['output'], - m2['output']).ratio() + match = difflib.SequenceMatcher(None, meta1['output'], + meta2['output']).ratio() if match < ratio: # print >> sys.stderr, "Expected:%sGot :%s" \ - # % (m2['output'], m1['output']) + # % (meta2['output'], meta1['output']) return True - mime = m2['mime'].split(":")[-1].split(";")[0].strip() - old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() + mime = meta2['mime'].split(":")[-1].split(";")[0].strip() + old_mime = meta1['mime'].split(":")[-1].split(";")[0].strip() # if old_mime is empty, then previous version of File didn't know that # filetype. we will hope that new mime is right. @@ -92,7 +93,7 @@ def is_regression(m1, m2, exact=False, ratio=0.7): return False -def get_diff(m1, m2, exact=False, ratio=0.7): +def get_diff(meta1, meta2, exact=False, ratio=0.7): """ Get textual description about how well file(1) outputs match. @@ -101,26 +102,26 @@ def get_diff(m1, m2, exact=False, ratio=0.7): .. todo:: Reduce code duplication with function is_regression """ - if m1['output'] is None or m2['output'] is None: + if meta1['output'] is None or meta2['output'] is None: return "Output is None, was there error during File execution?" text = "" - if m1['output'] != m2['output']: + if meta1['output'] != meta2['output']: # previous file didn't detect it, so we hope new output is ok - if not m1['output'].endswith("data\n"): + if not meta1['output'].endswith("data\n"): if exact: - if m1['output'] != m2['output']: - text = "Expected :%sGot :%s" % (m1['output'], - m2['output']) + if meta1['output'] != meta2['output']: + text = "Expected :%sGot :%s" % (meta1['output'], + meta2['output']) else: - match = difflib.SequenceMatcher(None, m1['output'], - m2['output']).ratio() + match = difflib.SequenceMatcher(None, meta1['output'], + meta2['output']).ratio() if match < ratio: - text = "Expected :%sGot :%s" % (m1['output'], - m2['output']) + text = "Expected :%sGot :%s" % (meta1['output'], + meta2['output']) - mime = m2['mime'].split(":")[-1].split(";")[0].strip() - old_mime = m1['mime'].split(":")[-1].split(";")[0].strip() + mime = meta2['mime'].split(":")[-1].split(";")[0].strip() + old_mime = meta1['mime'].split(":")[-1].split(";")[0].strip() want_mime_diff = False @@ -135,14 +136,14 @@ def get_diff(m1, m2, exact=False, ratio=0.7): want_mime_diff = True want_mime_diff = True # TODO: this invalidates lines above if want_mime_diff: - text += "Expected :%sGot :%s" % (m1['mime'], m2['mime']) + text += "Expected :%sGot :%s" % (meta1['mime'], meta2['mime']) if text != "": - if ('pattern' in m1) and ('pattern' in m2) and \ - m1['pattern'] != "" and m2['pattern'] != "": - for line in \ - difflib.unified_diff(StringIO(m1['pattern']).readlines(), - StringIO(m2['pattern']).readlines()): + if ('pattern' in meta1) and ('pattern' in meta2) and \ + meta1['pattern'] != "" and meta2['pattern'] != "": + for line in difflib.unified_diff( + StringIO(meta1['pattern']).readlines(), + StringIO(meta2['pattern']).readlines()): text += line return text @@ -156,25 +157,25 @@ def get_stored_files(dir_name, subdir=True, *args): Matched file names are added to the list. If there are no additional arguments, all files found in the directory are added to the list. - Example usage: fileList = dirEntries(r'H:\TEMP', False, 'txt', 'py') + Example usage: file_list = dirEntries(r'H:\TEMP', False, 'txt', 'py') Only files with 'txt' and 'py' extensions will be added to the list. - Example usage: fileList = dirEntries(r'H:\TEMP', True) + Example usage: file_list = dirEntries(r'H:\TEMP', True) All files and all the files in subdirectories under H:\TEMP will be added to the list. """ - fileList = [] - for file in os.listdir(dir_name): - dirfile = os.path.join(dir_name, file) + file_list = [] + for file_name in os.listdir(dir_name): + dirfile = os.path.join(dir_name, file_name) if os.path.isfile(dirfile): if not args: if not dirfile.endswith("pickle") and \ not dirfile.endswith(".source.txt"): - fileList.append(dirfile) + file_list.append(dirfile) else: if os.path.splitext(dirfile)[1][1:] in args: - fileList.append(dirfile) + file_list.append(dirfile) # recursively access file names in subdirectories elif os.path.isdir(dirfile) and subdir: # print "Accessing directory:", dirfile - fileList.extend(get_stored_files(dirfile, subdir, *args)) - return fileList + file_list.extend(get_stored_files(dirfile, subdir, *args)) + return file_list From ce3965c72ee180cb173c3b9c30215a9612142b84 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 10 Jan 2019 17:57:37 +0100 Subject: [PATCH 30/42] pep8-ify, pylint-ify file.py: Re-indent --- pyfile/file.py | 424 ++++++++++++++++++++++++------------------------- 1 file changed, 212 insertions(+), 212 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index 4ca7d26..aab106c 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -25,242 +25,242 @@ import re def print_file_info(file_binary = 'file'): - if not file_binary.startswith("/") and not file_binary.startswith("./") and not file_binary.startswith("../"): - popen = Popen('which ' + file_binary, shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - output_which = pipe.read().strip() - if popen.wait() != 0: - raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_which)) - else: - output_which = file_binary - popen = Popen(file_binary + " --version", shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - output_ver = pipe.read().strip() - if popen.wait() not in (0,1): - raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_ver)) - print 'using file from', output_which - print 'version is', output_ver + if not file_binary.startswith("/") and not file_binary.startswith("./") and not file_binary.startswith("../"): + popen = Popen('which ' + file_binary, shell=True, bufsize=4096, stdout=PIPE) + pipe = popen.stdout + output_which = pipe.read().strip() + if popen.wait() != 0: + raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_which)) + else: + output_which = file_binary + popen = Popen(file_binary + " --version", shell=True, bufsize=4096, stdout=PIPE) + pipe = popen.stdout + output_ver = pipe.read().strip() + if popen.wait() not in (0,1): + raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_ver)) + print 'using file from', output_which + print 'version is', output_ver def mkdir_p(path): - try: - os.makedirs(path) - except OSError as exc: # Python >2.5 - if exc.errno == errno.EEXIST: - pass - else: raise + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST: + pass + else: raise def get_file_output(filename, binary = "file"): - popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) - pipe = popen.stdout - output = pipe.read() - output_err = popen.stderr.read() - if popen.wait() != 0: - return "Error while calling file, output: " + str(output) - return output + popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) + pipe = popen.stdout + output = pipe.read() + output_err = popen.stderr.read() + if popen.wait() != 0: + return "Error while calling file, output: " + str(output) + return output def get_file_mime(filename, binary = "file"): - popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) - pipe = popen.stdout - output = pipe.read() - output_err = popen.stderr.read() - if popen.wait() != 0: - return "Error while calling file, output: " + str(output) - return output + popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) + pipe = popen.stdout + output = pipe.read() + output_err = popen.stderr.read() + if popen.wait() != 0: + return "Error while calling file, output: " + str(output) + return output def get_simple_metadata(filename, binary = "file"): - metadata = {} - metadata['output'] = get_file_output(filename, binary) - metadata['mime'] = get_file_mime(filename, binary) - return metadata + metadata = {} + metadata['output'] = get_file_output(filename, binary) + metadata['mime'] = get_file_mime(filename, binary) + return metadata def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", only_name = False): - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - outputdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" - mkdir_p(outputdir) + FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() + outputdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" + mkdir_p(outputdir) - files = os.listdir(magdir) - files.sort() - if len(files) == 0: - raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) - prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') - for f in files: - mfile = os.path.join(magdir, f) - if os.path.isdir(mfile): - continue - fd = open(mfile, "r") - buff = "" - in_pattern = False - prog.increment_amount() - print prog, "Splitting patterns", '\r', - sys.stdout.flush() - lines = fd.readlines() - for i,line in enumerate(lines): - if line.strip().startswith("#") or len(line.strip()) == 0: - continue - #print line.strip() - if line.strip()[0].isdigit(): - if in_pattern: - fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") - fd_out.write(buff) - fd_out.close() - in_pattern = False - buff = "" - if only_name: - if not re.match("^[0-9]*(\\s)*name", line.strip()): - continue - in_pattern = True - pattern_id += 1 - buff += "#" + f +"\n" - buff += "# Automatically generated from:\n" - buff += "#" + f + ":" + str(i) + "\n" - buff += line - elif line.strip().startswith(">") or line.strip().startswith("!"): - if in_pattern: - buff += line - elif only_name == False: - print "broken pattern in file '" + f + "':" + str(i) - if in_pattern: - fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") - fd_out.write(buff) - fd_out.close() - fd.close() - return pattern_id + files = os.listdir(magdir) + files.sort() + if len(files) == 0: + raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) + prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') + for f in files: + mfile = os.path.join(magdir, f) + if os.path.isdir(mfile): + continue + fd = open(mfile, "r") + buff = "" + in_pattern = False + prog.increment_amount() + print prog, "Splitting patterns", '\r', + sys.stdout.flush() + lines = fd.readlines() + for i,line in enumerate(lines): + if line.strip().startswith("#") or len(line.strip()) == 0: + continue + #print line.strip() + if line.strip()[0].isdigit() or (line.strip()[0] == '-' and line.strip()[1].isdigit()): + if in_pattern: + fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") + fd_out.write(buff) + fd_out.close() + in_pattern = False + buff = "" + if only_name: + if not re.match("^[0-9]*(\\s)*name", line.strip()): + continue + in_pattern = True + pattern_id += 1 + buff += "#" + f +"\n" + buff += "# Automatically generated from:\n" + buff += "#" + f + ":" + str(i) + "\n" + buff += line + elif line.strip().startswith(">") or line.strip().startswith("!"): + if in_pattern: + buff += line + elif only_name == False: + print "broken pattern in file '" + f + "':" + str(i) + if in_pattern: + fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") + fd_out.write(buff) + fd_out.close() + fd.close() + return pattern_id def split_patterns(magdir = "Magdir", file_name = "file"): - pattern_id = _split_patterns(0, magdir, file_name, True) - _split_patterns(pattern_id, magdir, file_name) + pattern_id = _split_patterns(0, magdir, file_name, True) + _split_patterns(pattern_id, magdir, file_name) - print '' + print '' def compile_patterns(file_name = "file", file_binary = "file"): - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" - files = os.listdir(magdir) - if len(files) == 0: - raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) - files.sort(key=lambda x: [int(x)]) - mkdir_p(".mgc_temp") - mkdir_p(".mgc_temp/" + FILE_BINARY_HASH) - mkdir_p(".mgc_temp/" + FILE_BINARY_HASH + "/tmp") - prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') + FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() + magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" + files = os.listdir(magdir) + if len(files) == 0: + raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) + files.sort(key=lambda x: [int(x)]) + mkdir_p(".mgc_temp") + mkdir_p(".mgc_temp/" + FILE_BINARY_HASH) + mkdir_p(".mgc_temp/" + FILE_BINARY_HASH + "/tmp") + prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') - for i,f in enumerate(files): - out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ".mgc" - if not os.path.exists(out_file): - fd = open(os.path.join(magdir, f), "r") - buf = fd.read() - fd.close() - x = buf.split("\n")[0][1:len(buf.split("\n")[0])] - tmp = open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + "/tmp/" + x), "a") - tmp.write(buf) - tmp.flush() - tmp.close() - ##tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", "a") - ##tmp.write(buf) - ##tmp.flush() - ##tmp.close() - #os.chdir(".mgc_temp") - #print "cp .mgc_temp/.find-magic.tmp .mgc_temp/.find-magic.tmp." + str(i) + ";" + FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) + ";" - #mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; - - ##os.system("cp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";file -C -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";") - cmd = file_binary + " -C -m .mgc_temp/" + FILE_BINARY_HASH + "/tmp" - ret_code = os.system(cmd) - if ret_code != 0: - raise ValueError('command {0} returned non-zero exit code {1}!'.format(cmd, ret_code)) - if os.path.exists("tmp.mgc"): - ret_code = os.system("mv tmp.mgc " + out_file) - if ret_code != 0: - raise ValueError('moving tmp.mgc to {0} failed with code {1}!'.format(out_file, ret_code)) - #os.chdir("..") - prog.increment_amount() - print prog, "Compiling patterns", '\r', - sys.stdout.flush() - print "" + for i,f in enumerate(files): + out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ".mgc" + if not os.path.exists(out_file): + fd = open(os.path.join(magdir, f), "r") + buf = fd.read() + fd.close() + x = buf.split("\n")[0][1:len(buf.split("\n")[0])] + tmp = open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + "/tmp/" + x), "a") + tmp.write(buf) + tmp.flush() + tmp.close() + ##tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", "a") + ##tmp.write(buf) + ##tmp.flush() + ##tmp.close() + #os.chdir(".mgc_temp") + #print "cp .mgc_temp/.find-magic.tmp .mgc_temp/.find-magic.tmp." + str(i) + ";" + FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) + ";" + #mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; + + ##os.system("cp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";file -C -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";") + cmd = file_binary + " -C -m .mgc_temp/" + FILE_BINARY_HASH + "/tmp" + ret_code = os.system(cmd) + if ret_code != 0: + raise ValueError('command {0} returned non-zero exit code {1}!'.format(cmd, ret_code)) + if os.path.exists("tmp.mgc"): + ret_code = os.system("mv tmp.mgc " + out_file) + if ret_code != 0: + raise ValueError('moving tmp.mgc to {0} failed with code {1}!'.format(out_file, ret_code)) + #os.chdir("..") + prog.increment_amount() + print prog, "Compiling patterns", '\r', + sys.stdout.flush() + print "" def get_full_metadata(infile, file_name = "file", compiled = True, file_binary = "file"): - """ file-output plus binary search to find the relevant line in magic file """ - COMPILED_SUFFIX = ".mgc" - if not compiled: - COMPILED_SUFFIX = "" - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" - FILE_BINARY = file_binary - files = os.listdir(magdir) - files.sort(key=lambda x: [int(x)]) - tlist = [] - mkdir_p(".mgc_temp") - a = 0 - b = len(files) - 1 - i = b + """ file-output plus binary search to find the relevant line in magic file """ + COMPILED_SUFFIX = ".mgc" + if not compiled: + COMPILED_SUFFIX = "" + FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() + magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" + FILE_BINARY = file_binary + files = os.listdir(magdir) + files.sort(key=lambda x: [int(x)]) + tlist = [] + mkdir_p(".mgc_temp") + a = 0 + b = len(files) - 1 + i = b - a_out = "" - b_out = None + a_out = "" + b_out = None - while True: - f = files[i] - cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - #print FILE_BINARY + " " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - last = pipe.read() - if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, last.strip())} - if b_out == None: - b_out = last - # a---------i---------b - # a_out == last \solution here - if last != b_out: - a = i - a_out = last - # a-------------------i-------------------b - # solution here/ last == b_out - else: - b = i - b_out = last + while True: + f = files[i] + cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + #print FILE_BINARY + " " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) + pipe = popen.stdout + last = pipe.read() + if popen.wait() != 0: + return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, last.strip())} + if b_out == None: + b_out = last + # a---------i---------b + # a_out == last \solution here + if last != b_out: + a = i + a_out = last + # a-------------------i-------------------b + # solution here/ last == b_out + else: + b = i + b_out = last - if i == a + (b - a) / 2: - if b_out != last: - i += 1 - last = b_out - f = files[i] - #if f in PATTERNS: - #PATTERNS.remove(f); - #print i, f - fd = open(os.path.join(magdir, f), "r") - buf = fd.read() - fd.close() - if os.path.exists(os.path.dirname(FILE_BINARY) + "/../magic/magic.mime.mgc"): - cmd = FILE_BINARY + " -bi " + infile + " -m " + os.path.dirname(FILE_BINARY) + "/../magic/magic" - else: - cmd = FILE_BINARY + " -bi " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) - pipe = popen.stdout - mime = pipe.read() - if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, mime.strip())} - tlist.append(last) - index = infile.find('.') - if index == -1: - suffix = "" - else: - suffix = infile[index:] - if last == "data\n" and i == 0: - buf = "" - return {'output':last, 'mime':mime, 'pattern':buf, "suffix":suffix} - else: - i = a + (b - a) / 2 + if i == a + (b - a) / 2: + if b_out != last: + i += 1 + last = b_out + f = files[i] + #if f in PATTERNS: + #PATTERNS.remove(f); + #print i, f + fd = open(os.path.join(magdir, f), "r") + buf = fd.read() + fd.close() + if os.path.exists(os.path.dirname(FILE_BINARY) + "/../magic/magic.mime.mgc"): + cmd = FILE_BINARY + " -bi " + infile + " -m " + os.path.dirname(FILE_BINARY) + "/../magic/magic" + else: + cmd = FILE_BINARY + " -bi " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) + pipe = popen.stdout + mime = pipe.read() + if popen.wait() != 0: + return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, mime.strip())} + tlist.append(last) + index = infile.find('.') + if index == -1: + suffix = "" + else: + suffix = infile[index:] + if last == "data\n" and i == 0: + buf = "" + return {'output':last, 'mime':mime, 'pattern':buf, "suffix":suffix} + else: + i = a + (b - a) / 2 def is_compilation_supported(file_name = "file", file_binary = "file"): - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp.0.mgc > /dev/null") != 0: - print '' - print "This file version doesn't support compiled patterns => they won't be used" - return False - else: - print 'Compiled patterns will be used' - print '' - return True + FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() + if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp.0.mgc > /dev/null") != 0: + print '' + print "This file version doesn't support compiled patterns => they won't be used" + return False + else: + print 'Compiled patterns will be used' + print '' + return True From 4ff8f412df86632e0f72cf5ea0268cb00232428a Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 10:24:25 +0100 Subject: [PATCH 31/42] pep8-ify, pylint-ify file.py: wrap lines --- pyfile/file.py | 101 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index aab106c..a9dd436 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -13,8 +13,8 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. import os import sys @@ -25,19 +25,24 @@ import re def print_file_info(file_binary = 'file'): - if not file_binary.startswith("/") and not file_binary.startswith("./") and not file_binary.startswith("../"): - popen = Popen('which ' + file_binary, shell=True, bufsize=4096, stdout=PIPE) + if not file_binary.startswith("/") and not file_binary.startswith("./") \ + and not file_binary.startswith("../"): + popen = Popen('which ' + file_binary, shell=True, bufsize=4096, + stdout=PIPE) pipe = popen.stdout output_which = pipe.read().strip() if popen.wait() != 0: - raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_which)) + raise ValueError('could not query {0} for its version ({1})!' + .format(file_binary, output_which)) else: output_which = file_binary - popen = Popen(file_binary + " --version", shell=True, bufsize=4096, stdout=PIPE) + popen = Popen(file_binary + " --version", shell=True, bufsize=4096, + stdout=PIPE) pipe = popen.stdout output_ver = pipe.read().strip() if popen.wait() not in (0,1): - raise ValueError('could not query {0} for its version ({1})!'.format(file_binary, output_ver)) + raise ValueError('could not query {0} for its version ({1})!' + .format(file_binary, output_ver)) print 'using file from', output_which print 'version is', output_ver @@ -51,7 +56,8 @@ def mkdir_p(path): else: raise def get_file_output(filename, binary = "file"): - popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) + popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, + stdout=PIPE, stderr=PIPE) pipe = popen.stdout output = pipe.read() output_err = popen.stderr.read() @@ -60,7 +66,8 @@ def get_file_output(filename, binary = "file"): return output def get_file_mime(filename, binary = "file"): - popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) + popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, + stdout=PIPE, stderr=PIPE) pipe = popen.stdout output = pipe.read() output_err = popen.stderr.read() @@ -74,7 +81,8 @@ def get_simple_metadata(filename, binary = "file"): metadata['mime'] = get_file_mime(filename, binary) return metadata -def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", only_name = False): +def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", + only_name = False): FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() outputdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" mkdir_p(outputdir) @@ -82,7 +90,8 @@ def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", only_ files = os.listdir(magdir) files.sort() if len(files) == 0: - raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) + raise ValueError('no files found in Magdir {0}' + .format( os.path.join(os.getcwd(), magdir) )) prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') for f in files: mfile = os.path.join(magdir, f) @@ -99,9 +108,11 @@ def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", only_ if line.strip().startswith("#") or len(line.strip()) == 0: continue #print line.strip() - if line.strip()[0].isdigit() or (line.strip()[0] == '-' and line.strip()[1].isdigit()): + if line.strip()[0].isdigit() or \ + (line.strip()[0] == '-' and line.strip()[1].isdigit()): if in_pattern: - fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") + fd_out = open(os.path.join(outputdir, str(pattern_id)), + "w") fd_out.write(buff) fd_out.close() in_pattern = False @@ -138,7 +149,8 @@ def compile_patterns(file_name = "file", file_binary = "file"): magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" files = os.listdir(magdir) if len(files) == 0: - raise ValueError('no files found in Magdir {0}'.format( os.path.join(os.getcwd(), magdir) )) + raise ValueError('no files found in Magdir {0}' + .format( os.path.join(os.getcwd(), magdir) )) files.sort(key=lambda x: [int(x)]) mkdir_p(".mgc_temp") mkdir_p(".mgc_temp/" + FILE_BINARY_HASH) @@ -146,41 +158,56 @@ def compile_patterns(file_name = "file", file_binary = "file"): prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') for i,f in enumerate(files): - out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ".mgc" + out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + \ + str(i) + ".mgc" if not os.path.exists(out_file): fd = open(os.path.join(magdir, f), "r") buf = fd.read() fd.close() x = buf.split("\n")[0][1:len(buf.split("\n")[0])] - tmp = open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + "/tmp/" + x), "a") + tmp = open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + \ + "/tmp/" + x), "a") tmp.write(buf) tmp.flush() tmp.close() - ##tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", "a") + ##tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", + ## "a") ##tmp.write(buf) ##tmp.flush() ##tmp.close() #os.chdir(".mgc_temp") - #print "cp .mgc_temp/.find-magic.tmp .mgc_temp/.find-magic.tmp." + str(i) + ";" + FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) + ";" + #print "cp .mgc_temp/.find-magic.tmp " + \ + # ".mgc_temp/.find-magic.tmp." + str(i) + ";" + \ + # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) \ + # + ";" #mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; - ##os.system("cp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";file -C -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + ";") + ##os.system("cp .mgc_temp/" + FILE_BINARY_HASH + + # "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + + # "/.find-magic.tmp." + str(i) + ";" + + # "file -C -m .mgc_temp/" + FILE_BINARY_HASH + + # "/.find-magic.tmp." + str(i) + ";") cmd = file_binary + " -C -m .mgc_temp/" + FILE_BINARY_HASH + "/tmp" ret_code = os.system(cmd) if ret_code != 0: - raise ValueError('command {0} returned non-zero exit code {1}!'.format(cmd, ret_code)) + raise ValueError('command {0} returned non-zero exit code {1}!' + .format(cmd, ret_code)) if os.path.exists("tmp.mgc"): ret_code = os.system("mv tmp.mgc " + out_file) if ret_code != 0: - raise ValueError('moving tmp.mgc to {0} failed with code {1}!'.format(out_file, ret_code)) + raise ValueError('moving tmp.mgc to {0} failed with code ' + '{1}!'.format(out_file, ret_code)) #os.chdir("..") prog.increment_amount() print prog, "Compiling patterns", '\r', sys.stdout.flush() print "" -def get_full_metadata(infile, file_name = "file", compiled = True, file_binary = "file"): - """ file-output plus binary search to find the relevant line in magic file """ +def get_full_metadata(infile, file_name = "file", compiled = True, + file_binary = "file"): + """ + file-output plus binary search to find the relevant line in magic file + """ COMPILED_SUFFIX = ".mgc" if not compiled: COMPILED_SUFFIX = "" @@ -200,13 +227,16 @@ def get_full_metadata(infile, file_name = "file", compiled = True, file_binary = while True: f = files[i] - cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - #print FILE_BINARY + " " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + \ + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + #print FILE_BINARY + " " + infile + " -m .mgc_temp/" + \ + # FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout last = pipe.read() if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, last.strip())} + return {'output':None, 'mime':None, 'pattern':None, "suffix":None, + "err":(cmd, last.strip())} if b_out == None: b_out = last # a---------i---------b @@ -231,15 +261,20 @@ def get_full_metadata(infile, file_name = "file", compiled = True, file_binary = fd = open(os.path.join(magdir, f), "r") buf = fd.read() fd.close() - if os.path.exists(os.path.dirname(FILE_BINARY) + "/../magic/magic.mime.mgc"): - cmd = FILE_BINARY + " -bi " + infile + " -m " + os.path.dirname(FILE_BINARY) + "/../magic/magic" + if os.path.exists(os.path.dirname(FILE_BINARY) + + "/../magic/magic.mime.mgc"): + cmd = FILE_BINARY + " -bi " + infile + " -m " + \ + os.path.dirname(FILE_BINARY) + "/../magic/magic" else: - cmd = FILE_BINARY + " -bi " + infile + " -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + cmd = FILE_BINARY + " -bi " + infile + " -m .mgc_temp/" + \ + FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + \ + COMPILED_SUFFIX popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout mime = pipe.read() if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, "suffix":None, "err":(cmd, mime.strip())} + return {'output':None, 'mime':None, 'pattern':None, + "suffix":None, "err":(cmd, mime.strip())} tlist.append(last) index = infile.find('.') if index == -1: @@ -254,9 +289,11 @@ def get_full_metadata(infile, file_name = "file", compiled = True, file_binary = def is_compilation_supported(file_name = "file", file_binary = "file"): FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp.0.mgc > /dev/null") != 0: + if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + + "/.find-magic.tmp.0.mgc > /dev/null") != 0: print '' - print "This file version doesn't support compiled patterns => they won't be used" + print "This file version doesn't support compiled patterns " \ + "=> they won't be used" return False else: print 'Compiled patterns will be used' From a10d724c189565590d7acebde75a2a75c5be3d81 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 10:34:56 +0100 Subject: [PATCH 32/42] pep8-ify, pylint-ify file.py: whitespace --- pyfile/file.py | 100 ++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index a9dd436..7884ac3 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -24,7 +24,8 @@ import hashlib import re -def print_file_info(file_binary = 'file'): + +def print_file_info(file_binary='file'): if not file_binary.startswith("/") and not file_binary.startswith("./") \ and not file_binary.startswith("../"): popen = Popen('which ' + file_binary, shell=True, bufsize=4096, @@ -35,12 +36,12 @@ def print_file_info(file_binary = 'file'): raise ValueError('could not query {0} for its version ({1})!' .format(file_binary, output_which)) else: - output_which = file_binary + output_which = file_binary popen = Popen(file_binary + " --version", shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout output_ver = pipe.read().strip() - if popen.wait() not in (0,1): + if popen.wait() not in (0, 1): raise ValueError('could not query {0} for its version ({1})!' .format(file_binary, output_ver)) print 'using file from', output_which @@ -50,12 +51,14 @@ def print_file_info(file_binary = 'file'): def mkdir_p(path): try: os.makedirs(path) - except OSError as exc: # Python >2.5 + except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST: pass - else: raise + else: + raise + -def get_file_output(filename, binary = "file"): +def get_file_output(filename, binary="file"): popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) pipe = popen.stdout @@ -65,7 +68,8 @@ def get_file_output(filename, binary = "file"): return "Error while calling file, output: " + str(output) return output -def get_file_mime(filename, binary = "file"): + +def get_file_mime(filename, binary="file"): popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) pipe = popen.stdout @@ -75,14 +79,16 @@ def get_file_mime(filename, binary = "file"): return "Error while calling file, output: " + str(output) return output -def get_simple_metadata(filename, binary = "file"): + +def get_simple_metadata(filename, binary="file"): metadata = {} metadata['output'] = get_file_output(filename, binary) - metadata['mime'] = get_file_mime(filename, binary) + metadata['mime'] = get_file_mime(filename, binary) return metadata -def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", - only_name = False): + +def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", + only_name=False): FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() outputdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" mkdir_p(outputdir) @@ -91,7 +97,7 @@ def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", files.sort() if len(files) == 0: raise ValueError('no files found in Magdir {0}' - .format( os.path.join(os.getcwd(), magdir) )) + .format(os.path.join(os.getcwd(), magdir))) prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') for f in files: mfile = os.path.join(magdir, f) @@ -104,10 +110,10 @@ def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", print prog, "Splitting patterns", '\r', sys.stdout.flush() lines = fd.readlines() - for i,line in enumerate(lines): + for i, line in enumerate(lines): if line.strip().startswith("#") or len(line.strip()) == 0: continue - #print line.strip() + # print line.strip() if line.strip()[0].isdigit() or \ (line.strip()[0] == '-' and line.strip()[1].isdigit()): if in_pattern: @@ -122,7 +128,7 @@ def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", continue in_pattern = True pattern_id += 1 - buff += "#" + f +"\n" + buff += "#" + f + "\n" buff += "# Automatically generated from:\n" buff += "#" + f + ":" + str(i) + "\n" buff += line @@ -138,26 +144,28 @@ def _split_patterns(pattern_id = 0, magdir = "Magdir", file_name = "file", fd.close() return pattern_id -def split_patterns(magdir = "Magdir", file_name = "file"): + +def split_patterns(magdir="Magdir", file_name="file"): pattern_id = _split_patterns(0, magdir, file_name, True) _split_patterns(pattern_id, magdir, file_name) print '' -def compile_patterns(file_name = "file", file_binary = "file"): + +def compile_patterns(file_name="file", file_binary="file"): FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" files = os.listdir(magdir) if len(files) == 0: raise ValueError('no files found in Magdir {0}' - .format( os.path.join(os.getcwd(), magdir) )) + .format(os.path.join(os.getcwd(), magdir))) files.sort(key=lambda x: [int(x)]) mkdir_p(".mgc_temp") mkdir_p(".mgc_temp/" + FILE_BINARY_HASH) mkdir_p(".mgc_temp/" + FILE_BINARY_HASH + "/tmp") prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') - for i,f in enumerate(files): + for i, f in enumerate(files): out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + \ str(i) + ".mgc" if not os.path.exists(out_file): @@ -170,19 +178,19 @@ def compile_patterns(file_name = "file", file_binary = "file"): tmp.write(buf) tmp.flush() tmp.close() - ##tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", - ## "a") - ##tmp.write(buf) - ##tmp.flush() - ##tmp.close() - #os.chdir(".mgc_temp") - #print "cp .mgc_temp/.find-magic.tmp " + \ - # ".mgc_temp/.find-magic.tmp." + str(i) + ";" + \ - # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) \ - # + ";" - #mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; + # tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", + # "a") + # tmp.write(buf) + # tmp.flush() + # tmp.close() + # os.chdir(".mgc_temp") + # print "cp .mgc_temp/.find-magic.tmp " + \ + # ".mgc_temp/.find-magic.tmp." + str(i) + ";" + \ + # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i)\ + # + ";" + # mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; - ##os.system("cp .mgc_temp/" + FILE_BINARY_HASH + + # os.system("cp .mgc_temp/" + FILE_BINARY_HASH + # "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + # "/.find-magic.tmp." + str(i) + ";" + # "file -C -m .mgc_temp/" + FILE_BINARY_HASH + @@ -197,14 +205,15 @@ def compile_patterns(file_name = "file", file_binary = "file"): if ret_code != 0: raise ValueError('moving tmp.mgc to {0} failed with code ' '{1}!'.format(out_file, ret_code)) - #os.chdir("..") + # os.chdir("..") prog.increment_amount() print prog, "Compiling patterns", '\r', sys.stdout.flush() print "" -def get_full_metadata(infile, file_name = "file", compiled = True, - file_binary = "file"): + +def get_full_metadata(infile, file_name="file", compiled=True, + file_binary="file"): """ file-output plus binary search to find the relevant line in magic file """ @@ -229,14 +238,14 @@ def get_full_metadata(infile, file_name = "file", compiled = True, f = files[i] cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + \ FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - #print FILE_BINARY + " " + infile + " -m .mgc_temp/" + \ + # print FILE_BINARY + " " + infile + " -m .mgc_temp/" + \ # FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout last = pipe.read() if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, "suffix":None, - "err":(cmd, last.strip())} + return dict(output=None, mime=None, pattern=None, suffix=None, + err=(cmd, last.strip())) if b_out == None: b_out = last # a---------i---------b @@ -255,9 +264,9 @@ def get_full_metadata(infile, file_name = "file", compiled = True, i += 1 last = b_out f = files[i] - #if f in PATTERNS: - #PATTERNS.remove(f); - #print i, f + # if f in PATTERNS: + # PATTERNS.remove(f); + # print i, f fd = open(os.path.join(magdir, f), "r") buf = fd.read() fd.close() @@ -273,8 +282,8 @@ def get_full_metadata(infile, file_name = "file", compiled = True, pipe = popen.stdout mime = pipe.read() if popen.wait() != 0: - return {'output':None, 'mime':None, 'pattern':None, - "suffix":None, "err":(cmd, mime.strip())} + return dict(output=None, mime=None, pattern=None, suffix=None, + err=(cmd, mime.strip())) tlist.append(last) index = infile.find('.') if index == -1: @@ -283,11 +292,12 @@ def get_full_metadata(infile, file_name = "file", compiled = True, suffix = infile[index:] if last == "data\n" and i == 0: buf = "" - return {'output':last, 'mime':mime, 'pattern':buf, "suffix":suffix} + return dict(output=last, mime=mime, pattern=buf, suffix=suffix) else: i = a + (b - a) / 2 -def is_compilation_supported(file_name = "file", file_binary = "file"): + +def is_compilation_supported(file_name="file", file_binary="file"): FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp.0.mgc > /dev/null") != 0: @@ -299,5 +309,3 @@ def is_compilation_supported(file_name = "file", file_binary = "file"): print 'Compiled patterns will be used' print '' return True - - From 5ea1246bb4bc24efb7e472e600d70458d731939a Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 10:43:31 +0100 Subject: [PATCH 33/42] pep8-ify, pylint-ify file.py: import order --- pyfile/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfile/file.py b/pyfile/file.py index 7884ac3..d2d2e4c 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -20,9 +20,9 @@ import sys import errno from subprocess import Popen, PIPE -from progressbar import ProgressBar import hashlib import re +from progressbar import ProgressBar def print_file_info(file_binary='file'): From 652c1e9b7e927de56c4244b8e7f2279cd299d027 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 10:43:42 +0100 Subject: [PATCH 34/42] pep8-ify, pylint-ify file.py: unused variables --- pyfile/file.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index d2d2e4c..2f0ab35 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -65,7 +65,8 @@ def get_file_output(filename, binary="file"): output = pipe.read() output_err = popen.stderr.read() if popen.wait() != 0: - return "Error while calling file, output: " + str(output) + return "Error while calling file, output: " + str(output) + \ + str(output_err) return output @@ -76,7 +77,8 @@ def get_file_mime(filename, binary="file"): output = pipe.read() output_err = popen.stderr.read() if popen.wait() != 0: - return "Error while calling file, output: " + str(output) + return "Error while calling file, output: " + str(output) + \ + str(output_err) return output @@ -231,7 +233,7 @@ def get_full_metadata(infile, file_name="file", compiled=True, b = len(files) - 1 i = b - a_out = "" + # a_out = "" unused! b_out = None while True: From feebd42dd86fec8a0a90f4cbd5c68abfe77cb0a9 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 10:49:13 +0100 Subject: [PATCH 35/42] pep8-ify, pylint-ify file.py: print function --- pyfile/file.py | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index 2f0ab35..ddb57f3 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -16,8 +16,9 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +from __future__ import print_function + import os -import sys import errno from subprocess import Popen, PIPE import hashlib @@ -44,8 +45,8 @@ def print_file_info(file_binary='file'): if popen.wait() not in (0, 1): raise ValueError('could not query {0} for its version ({1})!' .format(file_binary, output_ver)) - print 'using file from', output_which - print 'version is', output_ver + print('using file from', output_which) + print('version is', output_ver) def mkdir_p(path): @@ -109,13 +110,12 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", buff = "" in_pattern = False prog.increment_amount() - print prog, "Splitting patterns", '\r', - sys.stdout.flush() + print(prog, "Splitting patterns", end='\r', flush=True) lines = fd.readlines() for i, line in enumerate(lines): if line.strip().startswith("#") or len(line.strip()) == 0: continue - # print line.strip() + # print(line.strip() if line.strip()[0].isdigit() or \ (line.strip()[0] == '-' and line.strip()[1].isdigit()): if in_pattern: @@ -138,7 +138,7 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", if in_pattern: buff += line elif only_name == False: - print "broken pattern in file '" + f + "':" + str(i) + print("broken pattern in file '" + f + "':" + str(i)) if in_pattern: fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") fd_out.write(buff) @@ -151,7 +151,7 @@ def split_patterns(magdir="Magdir", file_name="file"): pattern_id = _split_patterns(0, magdir, file_name, True) _split_patterns(pattern_id, magdir, file_name) - print '' + print('') def compile_patterns(file_name="file", file_binary="file"): @@ -186,10 +186,10 @@ def compile_patterns(file_name="file", file_binary="file"): # tmp.flush() # tmp.close() # os.chdir(".mgc_temp") - # print "cp .mgc_temp/.find-magic.tmp " + \ - # ".mgc_temp/.find-magic.tmp." + str(i) + ";" + \ - # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i)\ - # + ";" + # print("cp .mgc_temp/.find-magic.tmp " + + # ".mgc_temp/.find-magic.tmp." + str(i) + ";" + + # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) + # + ";") # mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; # os.system("cp .mgc_temp/" + FILE_BINARY_HASH + @@ -209,9 +209,8 @@ def compile_patterns(file_name="file", file_binary="file"): '{1}!'.format(out_file, ret_code)) # os.chdir("..") prog.increment_amount() - print prog, "Compiling patterns", '\r', - sys.stdout.flush() - print "" + print(prog, "Compiling patterns", end='\r', flush=True) + print("") def get_full_metadata(infile, file_name="file", compiled=True, @@ -240,8 +239,9 @@ def get_full_metadata(infile, file_name="file", compiled=True, f = files[i] cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + \ FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX - # print FILE_BINARY + " " + infile + " -m .mgc_temp/" + \ - # FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + # print(FILE_BINARY + " " + infile + " -m .mgc_temp/" + + # FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + + # COMPILED_SUFFIX) popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout last = pipe.read() @@ -268,7 +268,7 @@ def get_full_metadata(infile, file_name="file", compiled=True, f = files[i] # if f in PATTERNS: # PATTERNS.remove(f); - # print i, f + # print(i, f) fd = open(os.path.join(magdir, f), "r") buf = fd.read() fd.close() @@ -303,11 +303,11 @@ def is_compilation_supported(file_name="file", file_binary="file"): FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp.0.mgc > /dev/null") != 0: - print '' - print "This file version doesn't support compiled patterns " \ - "=> they won't be used" + print('') + print("This file version doesn't support compiled patterns " + "=> they won't be used") return False else: - print 'Compiled patterns will be used' - print '' + print('Compiled patterns will be used') + print('') return True From 4afbf46ef3350afc73928b552afa131819bc8036 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 11:30:03 +0100 Subject: [PATCH 36/42] pep8-ify, pylint-ify file.py: short variable names Since "b" is now "idx_rigt", adjust "b_out" to "out_rigt". Since "i" is now "idx_curr", adjust "last" to "out_curr" --- pyfile/file.py | 109 ++++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index ddb57f3..789a0a6 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -102,8 +102,8 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", raise ValueError('no files found in Magdir {0}' .format(os.path.join(os.getcwd(), magdir))) prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') - for f in files: - mfile = os.path.join(magdir, f) + for loop_file_name in files: + mfile = os.path.join(magdir, loop_file_name) if os.path.isdir(mfile): continue fd = open(mfile, "r") @@ -112,7 +112,7 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", prog.increment_amount() print(prog, "Splitting patterns", end='\r', flush=True) lines = fd.readlines() - for i, line in enumerate(lines): + for line_idx, line in enumerate(lines): if line.strip().startswith("#") or len(line.strip()) == 0: continue # print(line.strip() @@ -130,15 +130,16 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", continue in_pattern = True pattern_id += 1 - buff += "#" + f + "\n" + buff += "#" + loop_file_name + "\n" buff += "# Automatically generated from:\n" - buff += "#" + f + ":" + str(i) + "\n" + buff += "#" + loop_file_name + ":" + str(line_idx) + "\n" buff += line elif line.strip().startswith(">") or line.strip().startswith("!"): if in_pattern: buff += line elif only_name == False: - print("broken pattern in file '" + f + "':" + str(i)) + print("broken pattern in file '" + loop_file_name + "':" + + str(line_idx)) if in_pattern: fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") fd_out.write(buff) @@ -167,16 +168,16 @@ def compile_patterns(file_name="file", file_binary="file"): mkdir_p(".mgc_temp/" + FILE_BINARY_HASH + "/tmp") prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') - for i, f in enumerate(files): + for file_index, loop_file_name in enumerate(files): out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + \ - str(i) + ".mgc" + str(file_index) + ".mgc" if not os.path.exists(out_file): - fd = open(os.path.join(magdir, f), "r") + fd = open(os.path.join(magdir, loop_file_name), "r") buf = fd.read() fd.close() - x = buf.split("\n")[0][1:len(buf.split("\n")[0])] + first_line = buf.split("\n")[0][1:len(buf.split("\n")[0])] tmp = open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + \ - "/tmp/" + x), "a") + "/tmp/" + first_line), "a") tmp.write(buf) tmp.flush() tmp.close() @@ -187,16 +188,16 @@ def compile_patterns(file_name="file", file_binary="file"): # tmp.close() # os.chdir(".mgc_temp") # print("cp .mgc_temp/.find-magic.tmp " + - # ".mgc_temp/.find-magic.tmp." + str(i) + ";" + - # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + str(i) - # + ";") - # mv .find-magic.tmp." + str(i) + ".mgc .mgc_temp/; + # ".mgc_temp/.find-magic.tmp." + str(file_index) + ";" + + # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + + # str(file_index) + ";") + # mv .find-magic.tmp." + str(file_index) + ".mgc .mgc_temp/; # os.system("cp .mgc_temp/" + FILE_BINARY_HASH + # "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + - # "/.find-magic.tmp." + str(i) + ";" + + # "/.find-magic.tmp." + str(file_index) + ";" + # "file -C -m .mgc_temp/" + FILE_BINARY_HASH + - # "/.find-magic.tmp." + str(i) + ";") + # "/.find-magic.tmp." + str(file_index) + ";") cmd = file_binary + " -C -m .mgc_temp/" + FILE_BINARY_HASH + "/tmp" ret_code = os.system(cmd) if ret_code != 0: @@ -228,48 +229,51 @@ def get_full_metadata(infile, file_name="file", compiled=True, files.sort(key=lambda x: [int(x)]) tlist = [] mkdir_p(".mgc_temp") - a = 0 - b = len(files) - 1 - i = b - # a_out = "" unused! - b_out = None + # Divide and conquer + idx_left = 0 # left-most index to consider + idx_rigt = len(files) - 1 # right-most index to consider + idx_curr = idx_rigt # some index in the middle we currently test + + # out_left = "" # ouput at idx_left, unused + out_rigt = None # output at idx_rigt while True: - f = files[i] + file_curr = files[idx_curr] # file name at idx_curr cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + \ - FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + COMPILED_SUFFIX + FILE_BINARY_HASH + "/.find-magic.tmp." + str(idx_curr) + \ + COMPILED_SUFFIX # print(FILE_BINARY + " " + infile + " -m .mgc_temp/" + - # FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + + # FILE_BINARY_HASH + "/.find-magic.tmp." + str(idx_curr) + # COMPILED_SUFFIX) popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout - last = pipe.read() + out_curr = pipe.read() if popen.wait() != 0: return dict(output=None, mime=None, pattern=None, suffix=None, - err=(cmd, last.strip())) - if b_out == None: - b_out = last - # a---------i---------b - # a_out == last \solution here - if last != b_out: - a = i - a_out = last - # a-------------------i-------------------b - # solution here/ last == b_out + err=(cmd, out_curr.strip())) + if out_rigt == None: + out_rigt = out_curr + # idx_left---------idx_curr---------idx_rigt + # out_left == out_curr \solution here + if out_curr != out_rigt: + idx_left = idx_curr + # out_left = out_curr + # idx_left-------------------idx_curr-------------------idx_rigt + # solution here/ out_curr == out_rigt else: - b = i - b_out = last + idx_rigt = idx_curr + out_rigt = out_curr - if i == a + (b - a) / 2: - if b_out != last: - i += 1 - last = b_out - f = files[i] - # if f in PATTERNS: - # PATTERNS.remove(f); - # print(i, f) - fd = open(os.path.join(magdir, f), "r") + if idx_curr == idx_left + (idx_rigt - idx_left) / 2: + if out_rigt != out_curr: + idx_curr += 1 + out_curr = out_rigt + file_curr = files[idx_curr] + # if file_curr in PATTERNS: + # PATTERNS.remove(file_curr); + # print(idx_curr, file_curr) + fd = open(os.path.join(magdir, file_curr), "r") buf = fd.read() fd.close() if os.path.exists(os.path.dirname(FILE_BINARY) + @@ -278,7 +282,7 @@ def get_full_metadata(infile, file_name="file", compiled=True, os.path.dirname(FILE_BINARY) + "/../magic/magic" else: cmd = FILE_BINARY + " -bi " + infile + " -m .mgc_temp/" + \ - FILE_BINARY_HASH + "/.find-magic.tmp." + str(i) + \ + FILE_BINARY_HASH + "/.find-magic.tmp." + str(idx_curr) +\ COMPILED_SUFFIX popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout @@ -286,17 +290,18 @@ def get_full_metadata(infile, file_name="file", compiled=True, if popen.wait() != 0: return dict(output=None, mime=None, pattern=None, suffix=None, err=(cmd, mime.strip())) - tlist.append(last) + tlist.append(out_curr) index = infile.find('.') if index == -1: suffix = "" else: suffix = infile[index:] - if last == "data\n" and i == 0: + if out_curr == "data\n" and idx_curr == 0: buf = "" - return dict(output=last, mime=mime, pattern=buf, suffix=suffix) + return dict(output=out_curr, mime=mime, pattern=buf, suffix=suffix) else: - i = a + (b - a) / 2 + # set idx_curr to middle between idx_left and idx_rigt + idx_curr = idx_left + (idx_rigt - idx_left) / 2 def is_compilation_supported(file_name="file", file_binary="file"): From eb4520bf51b00f67ba583be115b95597c9141e9e Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 11:36:26 +0100 Subject: [PATCH 37/42] pep8-ify, pylint-ify file.py: use with open(...) --- pyfile/file.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index 789a0a6..2eb545c 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -106,12 +106,12 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", mfile = os.path.join(magdir, loop_file_name) if os.path.isdir(mfile): continue - fd = open(mfile, "r") buff = "" in_pattern = False prog.increment_amount() print(prog, "Splitting patterns", end='\r', flush=True) - lines = fd.readlines() + with open(mfile, "r") as reader: + lines = reader.readlines() for line_idx, line in enumerate(lines): if line.strip().startswith("#") or len(line.strip()) == 0: continue @@ -119,10 +119,9 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", if line.strip()[0].isdigit() or \ (line.strip()[0] == '-' and line.strip()[1].isdigit()): if in_pattern: - fd_out = open(os.path.join(outputdir, str(pattern_id)), - "w") - fd_out.write(buff) - fd_out.close() + with open(os.path.join(outputdir, str(pattern_id)), "w") \ + as writer: + writer.write(buff) in_pattern = False buff = "" if only_name: @@ -141,10 +140,8 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", print("broken pattern in file '" + loop_file_name + "':" + str(line_idx)) if in_pattern: - fd_out = open(os.path.join(outputdir, str(pattern_id)), "w") - fd_out.write(buff) - fd_out.close() - fd.close() + with open(os.path.join(outputdir, str(pattern_id)), "w") as writer: + writer.write(buff) return pattern_id @@ -172,15 +169,13 @@ def compile_patterns(file_name="file", file_binary="file"): out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + \ str(file_index) + ".mgc" if not os.path.exists(out_file): - fd = open(os.path.join(magdir, loop_file_name), "r") - buf = fd.read() - fd.close() + with open(os.path.join(magdir, loop_file_name), "r") as reader: + buf = reader.read() first_line = buf.split("\n")[0][1:len(buf.split("\n")[0])] - tmp = open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + \ - "/tmp/" + first_line), "a") - tmp.write(buf) - tmp.flush() - tmp.close() + with open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + \ + "/tmp/" + first_line), "a") as appender: + appender.write(buf) + appender.flush() # tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", # "a") # tmp.write(buf) @@ -273,9 +268,8 @@ def get_full_metadata(infile, file_name="file", compiled=True, # if file_curr in PATTERNS: # PATTERNS.remove(file_curr); # print(idx_curr, file_curr) - fd = open(os.path.join(magdir, file_curr), "r") - buf = fd.read() - fd.close() + with open(os.path.join(magdir, file_curr), "r") as reader: + buf = reader.read() if os.path.exists(os.path.dirname(FILE_BINARY) + "/../magic/magic.mime.mgc"): cmd = FILE_BINARY + " -bi " + infile + " -m " + \ From 159e7b7137e9b3485cd902fb6a65fc061ca3860e Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 13:08:14 +0100 Subject: [PATCH 38/42] pep8-ify, pylint-ify file.py: doc strings --- pyfile/file.py | 68 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index 2eb545c..b526453 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -16,6 +16,8 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +"""Wrapper for `file(1)` with additional pattern compilation & search.""" + from __future__ import print_function import os @@ -27,6 +29,7 @@ def print_file_info(file_binary='file'): + """`print()` absolute path and version of given `file(1)` binary.""" if not file_binary.startswith("/") and not file_binary.startswith("./") \ and not file_binary.startswith("../"): popen = Popen('which ' + file_binary, shell=True, bufsize=4096, @@ -50,6 +53,7 @@ def print_file_info(file_binary='file'): def mkdir_p(path): + """Wrapper around :py:func:`os.makedirs` that catches EEXIST.""" try: os.makedirs(path) except OSError as exc: # Python >2.5 @@ -60,6 +64,7 @@ def mkdir_p(path): def get_file_output(filename, binary="file"): + """Run file(1) binary on given filename, return output.""" popen = Popen(binary + " -b " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) pipe = popen.stdout @@ -72,6 +77,7 @@ def get_file_output(filename, binary="file"): def get_file_mime(filename, binary="file"): + """Run file(1) binary with mime option on given filename, return output.""" popen = Popen(binary + " -ib " + filename, shell=True, bufsize=4096, stdout=PIPE, stderr=PIPE) pipe = popen.stdout @@ -84,6 +90,14 @@ def get_file_mime(filename, binary="file"): def get_simple_metadata(filename, binary="file"): + """ + Get output of `file` and `file -i` on given filename. + + Calls :py:func:`get_file_output` and :py:func:`get_file_mime` and saves + them in a `dict` as fields `output` and `mime`. + + Quick version of :py:func:`get_full_metadata`. + """ metadata = {} metadata['output'] = get_file_output(filename, binary) metadata['mime'] = get_file_mime(filename, binary) @@ -92,6 +106,20 @@ def get_simple_metadata(filename, binary="file"): def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", only_name=False): + """ + Actual worker function for :py:func:split_patterns`. + + Creates `output` dir in `.mgc_temp`. Loops over pattern files in `magdir` + and for each pattern found in each file creates an extra file in `output` + dir with just that pattern. + + Output file name are just their pattern_id, starting with id given as arg. + + Arg `file_name` only used for getting dir name through hashing. `file(1)` + is not called here. + + Returns number of pattern files thus created. + """ FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() outputdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" mkdir_p(outputdir) @@ -118,6 +146,7 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", # print(line.strip() if line.strip()[0].isdigit() or \ (line.strip()[0] == '-' and line.strip()[1].isdigit()): + # start of next pattern. first write finished pattern to file if in_pattern: with open(os.path.join(outputdir, str(pattern_id)), "w") \ as writer: @@ -146,6 +175,12 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", def split_patterns(magdir="Magdir", file_name="file"): + """ + Given a dir with magic pattern files, create dir with isolated patterns. + + First create isolated pattern files for patterns with a "name" attribute. + Then create pattern files for all patterns. + """ pattern_id = _split_patterns(0, magdir, file_name, True) _split_patterns(pattern_id, magdir, file_name) @@ -153,6 +188,16 @@ def split_patterns(magdir="Magdir", file_name="file"): def compile_patterns(file_name="file", file_binary="file"): + """ + Creates increasingly complex magic files. + + Loops over isolated patterns, re-assembles original magic files pattern by + pattern and always re-creates a magic file. Creates files + `.mgc_temp/HASH/.find-magic.tmp.PATTERN-ID.mgc` used by + :py:func:`get_full_metadata`. + + This requires quite some space on disc. + """ FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" files = os.listdir(magdir) @@ -171,9 +216,12 @@ def compile_patterns(file_name="file", file_binary="file"): if not os.path.exists(out_file): with open(os.path.join(magdir, loop_file_name), "r") as reader: buf = reader.read() - first_line = buf.split("\n")[0][1:len(buf.split("\n")[0])] + # read name of original pattern file in magic dir from first line + mfile = buf.split("\n")[0][1:] + + # iteratively re-assemble original pattern file with open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + \ - "/tmp/" + first_line), "a") as appender: + "/tmp/" + mfile), "a") as appender: appender.write(buf) appender.flush() # tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", @@ -212,7 +260,11 @@ def compile_patterns(file_name="file", file_binary="file"): def get_full_metadata(infile, file_name="file", compiled=True, file_binary="file"): """ - file-output plus binary search to find the relevant line in magic file + file-output plus binary search to find the relevant line in magic file. + + Run `file(1)` repeatedly with different magic files created in + :py:func`compile_patterns` until the one pattern is identified that defines + the `file(1)` output of the given `infile`. """ COMPILED_SUFFIX = ".mgc" if not compiled: @@ -225,7 +277,7 @@ def get_full_metadata(infile, file_name="file", compiled=True, tlist = [] mkdir_p(".mgc_temp") - # Divide and conquer + # Divide and conquer: find the relevant pattern idx_left = 0 # left-most index to consider idx_rigt = len(files) - 1 # right-most index to consider idx_curr = idx_rigt # some index in the middle we currently test @@ -247,7 +299,7 @@ def get_full_metadata(infile, file_name="file", compiled=True, if popen.wait() != 0: return dict(output=None, mime=None, pattern=None, suffix=None, err=(cmd, out_curr.strip())) - if out_rigt == None: + if out_rigt == None: # first iteration, uses complete magic file out_rigt = out_curr # idx_left---------idx_curr---------idx_rigt # out_left == out_curr \solution here @@ -260,7 +312,10 @@ def get_full_metadata(infile, file_name="file", compiled=True, idx_rigt = idx_curr out_rigt = out_curr + # are we done? if idx_curr == idx_left + (idx_rigt - idx_left) / 2: + # idx_* are so close together that next iteration idx_curr would + # not change --> we are done if out_rigt != out_curr: idx_curr += 1 out_curr = out_rigt @@ -294,11 +349,12 @@ def get_full_metadata(infile, file_name="file", compiled=True, buf = "" return dict(output=out_curr, mime=mime, pattern=buf, suffix=suffix) else: - # set idx_curr to middle between idx_left and idx_rigt + # continue: set idx_curr to middle between idx_left and idx_rigt idx_curr = idx_left + (idx_rigt - idx_left) / 2 def is_compilation_supported(file_name="file", file_binary="file"): + """Determine whether data from :py:func:`compile_patterns` is available.""" FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp.0.mgc > /dev/null") != 0: From ec853ed760977ed39b84a88f8e27bddcb4f20b5b Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 14 Jan 2019 13:08:28 +0100 Subject: [PATCH 39/42] pep8-ify, pylint-ify file.py: todos --- pyfile/file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index b526453..125c3a8 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -125,7 +125,7 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", mkdir_p(outputdir) files = os.listdir(magdir) - files.sort() + files.sort() # TODO: sort like the others? if len(files) == 0: raise ValueError('no files found in Magdir {0}' .format(os.path.join(os.getcwd(), magdir))) @@ -246,7 +246,7 @@ def compile_patterns(file_name="file", file_binary="file"): if ret_code != 0: raise ValueError('command {0} returned non-zero exit code {1}!' .format(cmd, ret_code)) - if os.path.exists("tmp.mgc"): + if os.path.exists("tmp.mgc"): # TODO: move without forking shell ret_code = os.system("mv tmp.mgc " + out_file) if ret_code != 0: raise ValueError('moving tmp.mgc to {0} failed with code ' From 291d24913046bd9af6bc13588dd589094e5004e0 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Wed, 16 Jan 2019 14:16:22 +0100 Subject: [PATCH 40/42] pep8-ify, pylint-ify file.py: miscellaneous Lowercase variable names for non-constant variables Replace "if len(...) == 0", "if a == False" or "if a == None" --- pyfile/file.py | 83 +++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index 125c3a8..904967c 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -120,13 +120,13 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", Returns number of pattern files thus created. """ - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - outputdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" + file_binary_hash = hashlib.sha224(file_name).hexdigest() + outputdir = ".mgc_temp/" + file_binary_hash + "/output" mkdir_p(outputdir) files = os.listdir(magdir) files.sort() # TODO: sort like the others? - if len(files) == 0: + if not files: raise ValueError('no files found in Magdir {0}' .format(os.path.join(os.getcwd(), magdir))) prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') @@ -141,7 +141,7 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", with open(mfile, "r") as reader: lines = reader.readlines() for line_idx, line in enumerate(lines): - if line.strip().startswith("#") or len(line.strip()) == 0: + if line.strip().startswith("#") or not line.strip(): continue # print(line.strip() if line.strip()[0].isdigit() or \ @@ -165,7 +165,7 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", elif line.strip().startswith(">") or line.strip().startswith("!"): if in_pattern: buff += line - elif only_name == False: + elif not only_name: print("broken pattern in file '" + loop_file_name + "':" + str(line_idx)) if in_pattern: @@ -198,20 +198,20 @@ def compile_patterns(file_name="file", file_binary="file"): This requires quite some space on disc. """ - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" + file_binary_hash = hashlib.sha224(file_name).hexdigest() + magdir = ".mgc_temp/" + file_binary_hash + "/output" files = os.listdir(magdir) - if len(files) == 0: + if not files: raise ValueError('no files found in Magdir {0}' .format(os.path.join(os.getcwd(), magdir))) files.sort(key=lambda x: [int(x)]) mkdir_p(".mgc_temp") - mkdir_p(".mgc_temp/" + FILE_BINARY_HASH) - mkdir_p(".mgc_temp/" + FILE_BINARY_HASH + "/tmp") + mkdir_p(".mgc_temp/" + file_binary_hash) + mkdir_p(".mgc_temp/" + file_binary_hash + "/tmp") prog = ProgressBar(0, len(files), 50, mode='fixed', char='#') for file_index, loop_file_name in enumerate(files): - out_file = ".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp." + \ + out_file = ".mgc_temp/" + file_binary_hash + "/.find-magic.tmp." + \ str(file_index) + ".mgc" if not os.path.exists(out_file): with open(os.path.join(magdir, loop_file_name), "r") as reader: @@ -220,11 +220,11 @@ def compile_patterns(file_name="file", file_binary="file"): mfile = buf.split("\n")[0][1:] # iteratively re-assemble original pattern file - with open(os.path.join(".mgc_temp/" + FILE_BINARY_HASH + \ + with open(os.path.join(".mgc_temp/" + file_binary_hash + "/tmp/" + mfile), "a") as appender: appender.write(buf) appender.flush() - # tmp = open(".mgc_temp/" + FILE_BINARY_HASH + "/.find-magic.tmp", + # tmp = open(".mgc_temp/" + file_binary_hash + "/.find-magic.tmp", # "a") # tmp.write(buf) # tmp.flush() @@ -232,16 +232,16 @@ def compile_patterns(file_name="file", file_binary="file"): # os.chdir(".mgc_temp") # print("cp .mgc_temp/.find-magic.tmp " + # ".mgc_temp/.find-magic.tmp." + str(file_index) + ";" + - # FILE_BINARY + " -C -m .mgc_temp/.find-magic.tmp." + + # file_binary + " -C -m .mgc_temp/.find-magic.tmp." + # str(file_index) + ";") # mv .find-magic.tmp." + str(file_index) + ".mgc .mgc_temp/; - # os.system("cp .mgc_temp/" + FILE_BINARY_HASH + - # "/.find-magic.tmp .mgc_temp/" + FILE_BINARY_HASH + + # os.system("cp .mgc_temp/" + file_binary_hash + + # "/.find-magic.tmp .mgc_temp/" + file_binary_hash + # "/.find-magic.tmp." + str(file_index) + ";" + - # "file -C -m .mgc_temp/" + FILE_BINARY_HASH + + # "file -C -m .mgc_temp/" + file_binary_hash + # "/.find-magic.tmp." + str(file_index) + ";") - cmd = file_binary + " -C -m .mgc_temp/" + FILE_BINARY_HASH + "/tmp" + cmd = file_binary + " -C -m .mgc_temp/" + file_binary_hash + "/tmp" ret_code = os.system(cmd) if ret_code != 0: raise ValueError('command {0} returned non-zero exit code {1}!' @@ -266,12 +266,11 @@ def get_full_metadata(infile, file_name="file", compiled=True, :py:func`compile_patterns` until the one pattern is identified that defines the `file(1)` output of the given `infile`. """ - COMPILED_SUFFIX = ".mgc" + compiled_suffix = ".mgc" if not compiled: - COMPILED_SUFFIX = "" - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - magdir = ".mgc_temp/" + FILE_BINARY_HASH + "/output" - FILE_BINARY = file_binary + compiled_suffix = "" + file_binary_hash = hashlib.sha224(file_name).hexdigest() + magdir = ".mgc_temp/" + file_binary_hash + "/output" files = os.listdir(magdir) files.sort(key=lambda x: [int(x)]) tlist = [] @@ -287,19 +286,19 @@ def get_full_metadata(infile, file_name="file", compiled=True, while True: file_curr = files[idx_curr] # file name at idx_curr - cmd = FILE_BINARY + " -b " + infile + " -m .mgc_temp/" + \ - FILE_BINARY_HASH + "/.find-magic.tmp." + str(idx_curr) + \ - COMPILED_SUFFIX - # print(FILE_BINARY + " " + infile + " -m .mgc_temp/" + - # FILE_BINARY_HASH + "/.find-magic.tmp." + str(idx_curr) + - # COMPILED_SUFFIX) + cmd = file_binary + " -b " + infile + " -m .mgc_temp/" + \ + file_binary_hash + "/.find-magic.tmp." + str(idx_curr) + \ + compiled_suffix + # print(file_binary + " " + infile + " -m .mgc_temp/" + + # file_binary_hash + "/.find-magic.tmp." + str(idx_curr) + + # compiled_suffix) popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout out_curr = pipe.read() if popen.wait() != 0: return dict(output=None, mime=None, pattern=None, suffix=None, err=(cmd, out_curr.strip())) - if out_rigt == None: # first iteration, uses complete magic file + if out_rigt is None: # first iteration, uses complete magic file out_rigt = out_curr # idx_left---------idx_curr---------idx_rigt # out_left == out_curr \solution here @@ -325,14 +324,14 @@ def get_full_metadata(infile, file_name="file", compiled=True, # print(idx_curr, file_curr) with open(os.path.join(magdir, file_curr), "r") as reader: buf = reader.read() - if os.path.exists(os.path.dirname(FILE_BINARY) + + if os.path.exists(os.path.dirname(file_binary) + "/../magic/magic.mime.mgc"): - cmd = FILE_BINARY + " -bi " + infile + " -m " + \ - os.path.dirname(FILE_BINARY) + "/../magic/magic" + cmd = file_binary + " -bi " + infile + " -m " + \ + os.path.dirname(file_binary) + "/../magic/magic" else: - cmd = FILE_BINARY + " -bi " + infile + " -m .mgc_temp/" + \ - FILE_BINARY_HASH + "/.find-magic.tmp." + str(idx_curr) +\ - COMPILED_SUFFIX + cmd = file_binary + " -bi " + infile + " -m .mgc_temp/" + \ + file_binary_hash + "/.find-magic.tmp." + str(idx_curr) +\ + compiled_suffix popen = Popen(cmd, shell=True, bufsize=4096, stdout=PIPE) pipe = popen.stdout mime = pipe.read() @@ -355,14 +354,14 @@ def get_full_metadata(infile, file_name="file", compiled=True, def is_compilation_supported(file_name="file", file_binary="file"): """Determine whether data from :py:func:`compile_patterns` is available.""" - FILE_BINARY_HASH = hashlib.sha224(file_name).hexdigest() - if os.system(file_binary + " /bin/sh -m .mgc_temp/" + FILE_BINARY_HASH + + file_binary_hash = hashlib.sha224(file_name).hexdigest() + if os.system(file_binary + " /bin/sh -m .mgc_temp/" + file_binary_hash + "/.find-magic.tmp.0.mgc > /dev/null") != 0: print('') print("This file version doesn't support compiled patterns " "=> they won't be used") return False - else: - print('Compiled patterns will be used') - print('') - return True + + print('Compiled patterns will be used') + print('') + return True From 7b99417634ae0fe55cd8c59a1733e3aae598c24e Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Wed, 16 Jan 2019 18:23:38 +0100 Subject: [PATCH 41/42] Bring back old flush() the flush=True argument to print() was only added with py3.3 --- pyfile/file.py | 7 +++++-- update-db.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pyfile/file.py b/pyfile/file.py index 904967c..bfc909e 100644 --- a/pyfile/file.py +++ b/pyfile/file.py @@ -21,6 +21,7 @@ from __future__ import print_function import os +import sys import errno from subprocess import Popen, PIPE import hashlib @@ -137,7 +138,8 @@ def _split_patterns(pattern_id=0, magdir="Magdir", file_name="file", buff = "" in_pattern = False prog.increment_amount() - print(prog, "Splitting patterns", end='\r', flush=True) + print(prog, "Splitting patterns", end='\r') + sys.stdout.flush() with open(mfile, "r") as reader: lines = reader.readlines() for line_idx, line in enumerate(lines): @@ -253,7 +255,8 @@ def compile_patterns(file_name="file", file_binary="file"): '{1}!'.format(out_file, ret_code)) # os.chdir("..") prog.increment_amount() - print(prog, "Compiling patterns", end='\r', flush=True) + print(prog, "Compiling patterns", end='\r') + sys.stdout.flush() print("") diff --git a/update-db.py b/update-db.py index 4961def..e761125 100755 --- a/update-db.py +++ b/update-db.py @@ -79,7 +79,8 @@ def data_stored(data): return prog.increment_amount() if not hide: - print(prog, "Updating database", end='\r', flush=True) + print(prog, "Updating database", end='\r') + sys.stdout.flush() # create thread pool here, so program exits if error occurs earlier n_threads = 4 # TODO: probably need this instead of 2 in queueTasks From a6bda95097012d5f7fe8df4ad5ae8415481901e5 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Wed, 16 Jan 2019 18:23:58 +0100 Subject: [PATCH 42/42] Fix wrong argument name --- update-db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/update-db.py b/update-db.py index e761125..2d3b887 100755 --- a/update-db.py +++ b/update-db.py @@ -88,7 +88,7 @@ def data_stored(data): for index, entry in enumerate(entries): # Insert tasks into the queue and let them run pool.queueTask(store_mimedata, args=(entry, index % 2), - callback=data_stored) + taskCallback=data_stored) if global_error: print("Error when executing File binary") break