diff --git a/oletools/ftguess.py b/oletools/ftguess.py index 6db2c8644..75657e4d6 100644 --- a/oletools/ftguess.py +++ b/oletools/ftguess.py @@ -866,8 +866,6 @@ def main(): python_version = '%d.%d.%d' % sys.version_info[0:3] print ('ftguess %s on Python %s - http://decalage.info/python/oletools' % (__version__, python_version)) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') print ('') DEFAULT_LOG_LEVEL = "warning" # Default log level diff --git a/oletools/mraptor.py b/oletools/mraptor.py index 35bf6ed6d..069318eb0 100644 --- a/oletools/mraptor.py +++ b/oletools/mraptor.py @@ -253,8 +253,6 @@ def main(): # Print help if no arguments are passed if len(args) == 0: print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) - print('This is work in progress, please report issues at %s' % URL_ISSUES) - print(__doc__) parser.print_help() print('\nAn exit code is returned based on the analysis result:') for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious): @@ -263,7 +261,6 @@ def main(): # print banner with version print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) - print('This is work in progress, please report issues at %s' % URL_ISSUES) log_helper.enable_logging(level=options.loglevel) # enable logging in the modules: diff --git a/oletools/msodde.py b/oletools/msodde.py index 303d97476..ee1932148 100644 --- a/oletools/msodde.py +++ b/oletools/msodde.py @@ -225,8 +225,6 @@ # banner to be printed at program start BANNER = """msodde %s - http://decalage.info/python/oletools -THIS IS WORK IN PROGRESS - Check updates regularly! -Please report any issue at https://github.com/decalage2/oletools/issues """ % __version__ # === LOGGING ================================================================= diff --git a/oletools/oleid.py b/oletools/oleid.py index 294f073be..bd3b9929c 100644 --- a/oletools/oleid.py +++ b/oletools/oleid.py @@ -513,9 +513,6 @@ def main(): """Called when running this file as script. Shows all info on input file.""" # print banner with version print('oleid %s - http://decalage.info/oletools' % __version__) - print('THIS IS WORK IN PROGRESS - Check updates regularly!') - print('Please report any issue at ' - 'https://github.com/decalage2/oletools/issues') print('') parser = argparse.ArgumentParser(description=__doc__) diff --git a/oletools/olemeta.py b/oletools/olemeta.py index 61317460b..ee539ace7 100644 --- a/oletools/olemeta.py +++ b/oletools/olemeta.py @@ -132,8 +132,6 @@ def process_ole(ole): def main(): # print banner with version print('olemeta %s - http://decalage.info/python/oletools' % __version__) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') usage = 'usage: olemeta [options] [filename2 ...]' parser = optparse.OptionParser(usage=usage) diff --git a/oletools/oleobj.py b/oletools/oleobj.py index 9f67752ea..f75af9fc3 100644 --- a/oletools/oleobj.py +++ b/oletools/oleobj.py @@ -967,9 +967,6 @@ def main(cmd_line_args=None): # print banner with version ensure_stdout_handles_unicode() print('oleobj %s - http://decalage.info/oletools' % __version__) - print('THIS IS WORK IN PROGRESS - Check updates regularly!') - print('Please report any issue at ' - 'https://github.com/decalage2/oletools/issues') print('') usage = 'usage: %(prog)s [options] [filename2 ...]' diff --git a/oletools/oletimes.py b/oletools/oletimes.py index 5d7809a26..9783b85c0 100644 --- a/oletools/oletimes.py +++ b/oletools/oletimes.py @@ -111,8 +111,6 @@ def process_ole(ole): def main(): # print banner with version print('oletimes %s - http://decalage.info/python/oletools' % __version__) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') usage = 'usage: oletimes [options] [filename2 ...]' parser = optparse.OptionParser(usage=usage) diff --git a/oletools/olevba.py b/oletools/olevba.py index 52ffd5126..92be40deb 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -284,6 +284,7 @@ import email.feedparser import string # for printable import json # for json output mode (argument --json) +from random import random # import lxml or ElementTree for XML parsing: try: @@ -2372,7 +2373,7 @@ def detect_vba_strings(vba_code): # Otherwise, start and end offsets are incorrect. vba_code = vba_code.expandtabs() # Split the VBA code line by line to avoid MemoryError on large scripts: - for vba_line in vba_code.splitlines(): + for vba_line in split_vba_code(vba_code): for tokens, start, end in vba_expr_str.scanString(vba_line): encoded = vba_line[start:end] decoded = tokens[0] @@ -2393,6 +2394,50 @@ def detect_vba_strings(vba_code): return results +#: max length of vba code lines that is analyzed in one go. Bigger code chunks +#: are split. Reduce this if you run into memory trouble +MAX_CODE_LINE_LEN = 32000 +MAX_CODE_LINE_OVERLAP = 500 + + +def split_vba_code(vba_code): + """ Split vba code (or what is suspected to be one) into manageable parts + + Tries a regular :py:meth:`str.splitlines`, and if that fails (e.g. in case + of non-vba-code in text files or mis-interpreted rtf) splits the string at + random into large overlapping chunks. + + This prevents MemoryErrors in the following parsing of that line, most of + all if deobfuscating. + """ + if MAX_CODE_LINE_LEN < 10: + raise ValueError('unreasonably small value for max code line length') + if MAX_CODE_LINE_OVERLAP < 0: + raise ValueError('unreasonably small value for max code line overlap') + if MAX_CODE_LINE_OVERLAP > MAX_CODE_LINE_LEN: + raise ValueError('overlap must be smaller than chunks') + HALF_LEN = int(MAX_CODE_LINE_LEN//2) + HALF_OVERLAP = int(MAX_CODE_LINE_OVERLAP//2) + + for line in vba_code.splitlines(): + line_len = len(line) + mean_idx_add = 1.5 * HALF_LEN - 1.5 * HALF_OVERLAP + n_chunks = int(line_len / mean_idx_add) # only an approximation + start_idx = 0 + chunk_idx = 0 + while (line_len - start_idx) > MAX_CODE_LINE_LEN: + chunk_idx += 1 + chunk_size = HALF_LEN + int(random() * HALF_LEN) + log.debug('splitting line of size {0}, yielding chunk of size {1},' + ' starting at {2} (number {3} of approx. {4})' + .format(line_len, chunk_size, start_idx, chunk_idx, + n_chunks)) + yield line[start_idx:start_idx+chunk_size] + overlap = HALF_OVERLAP + int(random() * HALF_OVERLAP) + start_idx += max(1, chunk_size - overlap) + yield line[start_idx:] # yield the rest + + def json2ascii(json_obj, encoding='utf8', errors='replace'): """ ensure there is no unicode in json and all strings are safe to decode @@ -2792,6 +2837,11 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D # It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel: if data.startswith(b'ID'): self.open_slk(data) + # check whether this is mso data + if is_mso_file(data): + log.debug('Found ActiveMime header, decompressing MSO container') + ole_data = mso_file_extract(data) + self.open_ole(ole_data) # Check if this is a plain text VBA or VBScript file: # To avoid scanning binary files, we simply check for some control chars: if self.type is None and b'\x00' not in data: @@ -3569,6 +3619,9 @@ def extract_macros(self): log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc)) log.debug('Traceback:', exc_info=True) # do not raise the error, as it is unlikely to be a compressed macro stream + # instead, yield the code as-is, maybe it just was not compressed + log.debug('Try analyzing uncompressed code') + yield (self.filename, d.name, d.name, compressed_code) if self.xlm_macros: vba_code = '' for line in self.xlm_macros: diff --git a/oletools/rtfobj.py b/oletools/rtfobj.py index f0b4e654e..3685c6ff5 100644 --- a/oletools/rtfobj.py +++ b/oletools/rtfobj.py @@ -1011,8 +1011,6 @@ def main(): python_version = '%d.%d.%d' % sys.version_info[0:3] print ('rtfobj %s on Python %s - http://decalage.info/python/oletools' % (__version__, python_version)) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') print ('') DEFAULT_LOG_LEVEL = "warning" # Default log level diff --git a/tests/msodde/test_basic.py b/tests/msodde/test_basic.py index 7eed57998..89807ac02 100644 --- a/tests/msodde/test_basic.py +++ b/tests/msodde/test_basic.py @@ -73,6 +73,8 @@ def test_invalid_text(self): """ check that text file argument leads to non-zero exit status """ self.do_test_validity(join(BASE_DIR, 'basic/text'), Exception) + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") def test_encrypted(self): """ check that encrypted files lead to non-zero exit status @@ -119,6 +121,8 @@ def do_test_validity(self, filename, expect_error=None): class TestErrorOutput(unittest.TestCase): """msodde does not specify error by return code but text output.""" + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") def test_crypt_output(self): """Check for helpful error message when failing to decrypt.""" for suffix in 'doc', 'docm', 'docx', 'ppt', 'pptm', 'pptx', 'xls', \ diff --git a/tests/oleobj/test_basic.py b/tests/oleobj/test_basic.py index 3fdcab037..2f750abf2 100644 --- a/tests/oleobj/test_basic.py +++ b/tests/oleobj/test_basic.py @@ -3,6 +3,7 @@ import unittest from tempfile import mkdtemp from shutil import rmtree +from os import listdir, environ from os.path import join, isfile from hashlib import md5 from glob import glob @@ -91,10 +92,14 @@ def tearDown(self): elif self.temp_dir: rmtree(self.temp_dir) + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") def test_md5(self): """ test all files in oleobj test dir """ self.do_test_md5(['-d', self.temp_dir]) + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") def test_md5_args(self): """ test that oleobj can be called with -i and -v @@ -158,6 +163,19 @@ def test_non_streamed(self): return self.do_test_md5(['-d', self.temp_dir], test_fun=preread_file, only_run_every=4) + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") + def test_nodump(self): + """Ensure that with --nodump nothing is ever written to disc.""" + data_dir = join(DATA_BASE_DIR, 'oleobj') + for sample_name, _, _ in SAMPLES: + args = ['-d', self.temp_dir, '--nodump', join(data_dir, sample_name)] + call_and_capture('oleobj', args, + accept_nonzero_exit=True) + temp_dir_contents = listdir(self.temp_dir) + if temp_dir_contents: + self.fail('Found file in temp dir despite "--nodump": {}'.format(temp_dir_contents)) + class TestSaneFilenameCreation(unittest.TestCase): """ Test sanitization / creation of sane filenames """ diff --git a/tests/olevba/test_basic.py b/tests/olevba/test_basic.py index 5be1269a8..988c131e5 100644 --- a/tests/olevba/test_basic.py +++ b/tests/olevba/test_basic.py @@ -75,6 +75,8 @@ def test_rtf_behaviour(self): raise self.fail('Found "warn" in output line: "{}"' .format(line.rstrip())) + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") def test_crypt_return(self): """ Test that encrypted files give a certain return code. @@ -105,7 +107,7 @@ def test_crypt_return(self): .format(ret_code, args + [filename, ])) # test only first file with all arg combinations, others just - # without arg (test takes too long otherwise + # without arg (test takes too long otherwise) ADD_ARGS = ([], ) def test_xlm(self):