decalage2 · christian-intra2net · Jun 24, 2022 · Jul 19, 2022 · Oct 10, 2022 · Dec 22, 2017
diff --git a/oletools/ftguess.py b/oletools/ftguess.py
@@ -866,8 +866,6 @@ def main():
     python_version = '%d.%d.%d' % sys.version_info[0:3]
     print ('ftguess %s on Python %s - http://decalage.info/python/oletools' %
            (__version__, python_version))
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
     print ('')
 
     DEFAULT_LOG_LEVEL = "warning" # Default log level

diff --git a/oletools/mraptor.py b/oletools/mraptor.py
@@ -253,8 +253,6 @@ def main():
     # Print help if no arguments are passed
     if len(args) == 0:
         print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__)
-        print('This is work in progress, please report issues at %s' % URL_ISSUES)
-        print(__doc__)
         parser.print_help()
         print('\nAn exit code is returned based on the analysis result:')
         for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious):
@@ -263,7 +261,6 @@ def main():
 
     # print banner with version
     print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__)
-    print('This is work in progress, please report issues at %s' % URL_ISSUES)
 
     log_helper.enable_logging(level=options.loglevel)
     # enable logging in the modules:

diff --git a/oletools/msodde.py b/oletools/msodde.py
@@ -225,8 +225,6 @@
 
 # banner to be printed at program start
 BANNER = """msodde %s - http://decalage.info/python/oletools
-THIS IS WORK IN PROGRESS - Check updates regularly!
-Please report any issue at https://github.com/decalage2/oletools/issues
 """ % __version__
 
 # === LOGGING =================================================================

diff --git a/oletools/oleid.py b/oletools/oleid.py
@@ -513,9 +513,6 @@ def main():
     """Called when running this file as script. Shows all info on input file."""
     # print banner with version
     print('oleid %s - http://decalage.info/oletools' % __version__)
-    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print('Please report any issue at '
-          'https://github.com/decalage2/oletools/issues')
     print('')
 
     parser = argparse.ArgumentParser(description=__doc__)

diff --git a/oletools/olemeta.py b/oletools/olemeta.py
@@ -132,8 +132,6 @@ def process_ole(ole):
 def main():
     # print banner with version
     print('olemeta %s - http://decalage.info/python/oletools' % __version__)
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
 
     usage = 'usage: olemeta [options] <filename> [filename2 ...]'
     parser = optparse.OptionParser(usage=usage)

diff --git a/oletools/oleobj.py b/oletools/oleobj.py
@@ -967,9 +967,6 @@ def main(cmd_line_args=None):
     # print banner with version
     ensure_stdout_handles_unicode()
     print('oleobj %s - http://decalage.info/oletools' % __version__)
-    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print('Please report any issue at '
-          'https://github.com/decalage2/oletools/issues')
     print('')
 
     usage = 'usage: %(prog)s [options] <filename> [filename2 ...]'

diff --git a/oletools/oletimes.py b/oletools/oletimes.py
@@ -111,8 +111,6 @@ def process_ole(ole):
 def main():
     # print banner with version
     print('oletimes %s - http://decalage.info/python/oletools' % __version__)
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
 
     usage = 'usage: oletimes [options] <filename> [filename2 ...]'
     parser = optparse.OptionParser(usage=usage)

diff --git a/oletools/olevba.py b/oletools/olevba.py
@@ -284,6 +284,7 @@
 import email.feedparser
 import string  # for printable
 import json   # for json output mode (argument --json)
+from random import random
 
 # import lxml or ElementTree for XML parsing:
 try:
@@ -2372,7 +2373,7 @@ def detect_vba_strings(vba_code):
     #            Otherwise, start and end offsets are incorrect.
     vba_code = vba_code.expandtabs()
     # Split the VBA code line by line to avoid MemoryError on large scripts:
-    for vba_line in vba_code.splitlines():
+    for vba_line in split_vba_code(vba_code):
         for tokens, start, end in vba_expr_str.scanString(vba_line):
             encoded = vba_line[start:end]
             decoded = tokens[0]
@@ -2393,6 +2394,50 @@ def detect_vba_strings(vba_code):
     return results
 
 
+#: max length of vba code lines that are analyzed in one go. Longer lines
+#: are split into chunks. Reduce this if you run into memory trouble
+MAX_CODE_LINE_LEN = 32000
+MAX_CODE_LINE_OVERLAP = 500
+
+
+def split_vba_code(vba_code):
+    """ Split vba code (or what is suspected to be one) into manageable parts
+
+    Yields each line of :py:meth:`str.splitlines`; a line longer than
+    `MAX_CODE_LINE_LEN` (e.g. non-vba-code in text files or mis-interpreted
+    rtf) is yielded as randomly sized, overlapping chunks instead. This
+    avoids MemoryErrors when parsing that line, most of all if deobfuscating.
+
+    :raises ValueError: if the module-level length/overlap limits are invalid
+    """
+    if MAX_CODE_LINE_LEN < 10:
+        raise ValueError('unreasonably small value for max code line length')
+    if MAX_CODE_LINE_OVERLAP < 0:
+        raise ValueError('unreasonably small value for max code line overlap')
+    HALF_LEN = int(MAX_CODE_LINE_LEN//2)
+    HALF_OVERLAP = int(MAX_CODE_LINE_OVERLAP//2)
+    if HALF_OVERLAP >= HALF_LEN:   # else mean step is 0: ZeroDivisionError
+        raise ValueError('overlap must be smaller than chunks')
+    mean_idx_add = 1.5 * (HALF_LEN - HALF_OVERLAP)   # average step per chunk
+
+    for line in vba_code.splitlines():
+        line_len = len(line)
+        n_chunks = int(line_len / mean_idx_add)    # only an approximation
+        start_idx = 0
+        chunk_idx = 0
+        while (line_len - start_idx) > MAX_CODE_LINE_LEN:
+            chunk_idx += 1
+            chunk_size = HALF_LEN + int(random() * HALF_LEN)
+            log.debug('splitting line of size {0}, yielding chunk of size {1},'
+                      ' starting at {2} (number {3} of approx. {4})'
+                      .format(line_len, chunk_size, start_idx, chunk_idx,
+                              n_chunks))
+            yield line[start_idx:start_idx+chunk_size]
+            overlap = HALF_OVERLAP + int(random() * HALF_OVERLAP)
+            start_idx += max(1, chunk_size - overlap)
+        yield line[start_idx:]   # yield the rest
+
+
 def json2ascii(json_obj, encoding='utf8', errors='replace'):
     """
     ensure there is no unicode in json and all strings are safe to decode
@@ -2792,6 +2837,11 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D
             # It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel:
             if data.startswith(b'ID'):
                 self.open_slk(data)
+            # check whether this is mso data
+            if is_mso_file(data):
+                log.debug('Found ActiveMime header, decompressing MSO container')
+                ole_data = mso_file_extract(data)
+                self.open_ole(ole_data)
             # Check if this is a plain text VBA or VBScript file:
             # To avoid scanning binary files, we simply check for some control chars:
             if self.type is None and b'\x00' not in data:
@@ -3569,6 +3619,9 @@ def extract_macros(self):
                             log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc))
                             log.debug('Traceback:', exc_info=True)
                             # do not raise the error, as it is unlikely to be a compressed macro stream
+                            # instead, yield the code as-is, maybe it just was not compressed
+                            log.debug('Try analyzing uncompressed code')
+                            yield (self.filename, d.name, d.name, compressed_code)
             if self.xlm_macros:
                 vba_code = ''
                 for line in self.xlm_macros:

diff --git a/oletools/rtfobj.py b/oletools/rtfobj.py
@@ -1011,8 +1011,6 @@ def main():
     python_version = '%d.%d.%d' % sys.version_info[0:3]
     print ('rtfobj %s on Python %s - http://decalage.info/python/oletools' %
            (__version__, python_version))
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
     print ('')
 
     DEFAULT_LOG_LEVEL = "warning" # Default log level

diff --git a/tests/msodde/test_basic.py b/tests/msodde/test_basic.py
@@ -73,6 +73,8 @@ def test_invalid_text(self):
         """ check that text file argument leads to non-zero exit status """
         self.do_test_validity(join(BASE_DIR, 'basic/text'), Exception)
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_encrypted(self):
         """
         check that encrypted files lead to non-zero exit status
@@ -119,6 +121,8 @@ def do_test_validity(self, filename, expect_error=None):
 class TestErrorOutput(unittest.TestCase):
     """msodde does not specify error by return code but text output."""
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_crypt_output(self):
         """Check for helpful error message when failing to decrypt."""
         for suffix in 'doc', 'docm', 'docx', 'ppt', 'pptm', 'pptx', 'xls', \

diff --git a/tests/oleobj/test_basic.py b/tests/oleobj/test_basic.py
@@ -3,6 +3,7 @@
 import unittest
 from tempfile import mkdtemp
 from shutil import rmtree
+from os import listdir, environ
 from os.path import join, isfile
 from hashlib import md5
 from glob import glob
@@ -91,10 +92,14 @@ def tearDown(self):
         elif self.temp_dir:
             rmtree(self.temp_dir)
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_md5(self):
         """ test all files in oleobj test dir """
         self.do_test_md5(['-d', self.temp_dir])
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_md5_args(self):
         """
         test that oleobj can be called with -i and -v
@@ -158,6 +163,19 @@ def test_non_streamed(self):
         return self.do_test_md5(['-d', self.temp_dir], test_fun=preread_file,
                                 only_run_every=4)
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
+    def test_nodump(self):
+        """Run oleobj with `--nodump` on all samples; temp dir must stay empty."""
+        sample_dir = join(DATA_BASE_DIR, 'oleobj')
+        base_args = ['-d', self.temp_dir, '--nodump']
+        for sample_name, _, _ in SAMPLES:
+            call_and_capture('oleobj', base_args + [join(sample_dir, sample_name)],
+                             accept_nonzero_exit=True)
+        leftovers = listdir(self.temp_dir)
+        if leftovers:
+            self.fail('Found file in temp dir despite "--nodump": {}'.format(leftovers))
+
 
 class TestSaneFilenameCreation(unittest.TestCase):
     """ Test sanitization / creation of sane filenames """

diff --git a/tests/olevba/test_basic.py b/tests/olevba/test_basic.py
@@ -75,6 +75,8 @@ def test_rtf_behaviour(self):
                 raise self.fail('Found "warn" in output line: "{}"'
                                 .format(line.rstrip()))
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_crypt_return(self):
         """
         Test that encrypted files give a certain return code.
@@ -105,7 +107,7 @@ def test_crypt_return(self):
                                      .format(ret_code, args + [filename, ]))
 
                 # test only first file with all arg combinations, others just
-                # without arg (test takes too long otherwise
+                # without arg (test takes too long otherwise)
                 ADD_ARGS = ([], )
 
     def test_xlm(self):