Merge pull request #48 from regisb/regisb/multiline-with-singlelinecomment

robrap · web-flow · commit f61565ddd150 · 2020-09-02T15:48:41.000-04:00
[BD-21] Multiline annotations with single-line comment prefix ("#")
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -11,6 +11,11 @@ Change Log
 
 .. There should always be an "Unreleased" section for changes pending release.
 
+[0.6.0] - 2020-08-27
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Add support for multiline annotations for lines prefixed with single-line comment signs ("#")
+
 [0.5.1] - 2020-08-25
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py
@@ -2,4 +2,4 @@
 Extensible tools for parsing annotations in codebases.
 """
 
-__version__ = '0.5.1'
+__version__ = '0.6.0'
diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
@@ -42,22 +42,27 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
     # Javascript and Python extensions for examples.
     lang_comment_definition = None
 
-    r"""
-    This format string/regex finds all comments in the file. The format tokens will be replaced with the
-    language-specific comment definitions defined in the sub-classes.
-
-    {multi_start} - start of the language-specific multi-line comment (ex. /*)
-    ([\d\D]*?)    - capture all of the characters...
-    {multi_end}   - until you find the end of the language-specific multi-line comment (ex. */)
-    |             - If you don't find any of those...
-    {single}      - start by finding the single-line comment token (ex. //)
-    (.*)          - and capture all characters until the end of the line
-
-    Returns a 2-tuple of:
-     - ("Comment text", None) in the case of a multi-line comment OR
-     - (None, "Comment text") in the case of a single-line comment
+    # This format string/regex finds all comments in the file. The format tokens will be replaced with the
+    # language-specific comment definitions defined in the sub-classes.
+    #
+    # Match groupdict will contain two named subgroups: 'comment' and 'prefixed_comment', of which at most
+    # one will be non-None.
+    comment_regex_fmt = r"""
+        {multi_start}           # start of the language-specific multi-line comment (ex. /*)
+        (?P<comment>            # Look for a multiline comment
+            [\d\D]*?            # capture all of the characters...
+        )
+        {multi_end}             # until you find the end of the language-specific multi-line comment (ex. */)
+        |                       # If you don't find any of those...
+        (?P<prefixed_comment>   # Look for a group of single-line comments
+            (?:                 # Non-capturing group
+                {single}        # start by finding the single-line comment token (ex. //)
+                .*              # and capture all characters until the end of the line
+                \n?             # followed by an optional newline
+                \ *             # and some empty space
+            )*                  # multiple times
+        )
     """
-    comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)'
 
     def __init__(self, config, echo):
         """
@@ -74,7 +79,12 @@ def __init__(self, config, echo):
 
         # pylint: disable=not-a-mapping
         self.comment_regex = re.compile(
-            self.comment_regex_fmt.format(**self.lang_comment_definition)
+            self.comment_regex_fmt.format(**self.lang_comment_definition),
+            flags=re.VERBOSE
+        )
+        self.prefixed_comment_regex = re.compile(
+            r"^ *{single}".format(**self.lang_comment_definition),
+            flags=re.MULTILINE
         )
 
         # Parent class will allow this class to populate self.strings_to_search via
@@ -102,15 +112,15 @@ def search(self, file_handle):
         if any(anno in txt for anno in self.config.annotation_tokens):
             fname = clean_abs_path(file_handle.name, self.config.source_path)
 
+            # Iterate on all comments: both prefixed- and non-prefixed.
             for match in self.comment_regex.finditer(txt):
-                # Should only be one match
-                comment_content = [item for item in match.groups() if item is not None][0]
-                for inner_match in self.query.finditer(comment_content):
-                    # Get the line number by counting newlines + 1 (for the first line).
-                    # Note that this is the line number of the beginning of the comment, not the
-                    # annotation token itself.
-                    line = txt.count('\n', 0, match.start()) + 1
+                # Get the line number by counting newlines + 1 (for the first line).
+                # Note that this is the line number of the beginning of the comment, not the
+                # annotation token itself.
+                line = txt.count('\n', 0, match.start()) + 1
 
+                comment_content = self._find_comment_content(match)
+                for inner_match in self.query.finditer(comment_content):
                     try:
                         annotation_token = inner_match.group('token')
                         annotation_data = inner_match.group('data')
@@ -131,3 +141,27 @@ def search(self, file_handle):
                     })
 
         return found_annotations
+
+    def _find_comment_content(self, match):
+        """
+        Return the comment content as text.
+
+        Args:
+            match (re.Match): one of the matches of the self.comment_regex regular expression.
+        """
+        comment_content = match.groupdict()["comment"]
+        if comment_content:
+            return comment_content
+
+        # Find single-line comments and strip comment tokens
+        comment_content = match.groupdict()["prefixed_comment"]
+        return self._strip_single_line_comment_tokens(comment_content)
+
+    def _strip_single_line_comment_tokens(self, content):
+        """
+        Strip the leading single-line comment tokens from a comment text.
+
+        Args:
+            content (str): token-prefixed multi-line comment string.
+        """
+        return self.prefixed_comment_regex.sub("", content)
diff --git a/tests/extensions/python_test_files/multiline_singlelinecomment.pyt b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt
@@ -0,0 +1,7 @@
+# Docstring
+#.. pii: A long description that
+#  spans multiple
+#  lines
+# A comment that is not indented and not part of the above multi-line annotation
+#.. pii_types: id, name
+# Some comment that comes after the multiple-line annotation
diff --git a/tests/extensions/test_base_extensions.py b/tests/extensions/test_base_extensions.py
@@ -28,3 +28,19 @@ def test_nothing_found():
     r = FakeExtension(config, VerboseEcho())
     with open('tests/extensions/base_test_files/empty.foo') as f:
         r.search(f)
+
+
+def test_strip_single_line_comment_tokens():
+    config = FakeConfig()
+
+    extension = FakeExtension(config, VerboseEcho())
+    text = """baz line1
+  baz line2
+bazline3
+baz   line4"""
+    expected_result = """ line1
+ line2
+line3
+   line4"""
+    # pylint: disable=protected-access
+    assert expected_result == extension._strip_single_line_comment_tokens(text)
diff --git a/tests/extensions/test_extension_python.py b/tests/extensions/test_extension_python.py
@@ -76,6 +76,15 @@ def test_grouping_and_choice_failures(test_file, expected_exit_code, expected_me
      Multi-line and multi-paragraph.""")
         ]
     ),
+    (
+        'multiline_singlelinecomment.pyt',
+        [
+            ('.. pii:', """A long description that
+  spans multiple
+  lines"""),
+            ('.. pii_types:', 'id, name'),
+        ]
+    ),
 ])
 def test_multi_line_annotations(test_file, annotations):
     config = AnnotationConfig('tests/test_configurations/.annotations_test')