@@ -42,22 +42,27 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
4242 # Javascript and Python extensions for examples.
4343 lang_comment_definition = None
4444
45- r"""
46- This format string/regex finds all comments in the file. The format tokens will be replaced with the
47- language-specific comment definitions defined in the sub-classes.
48-
49- {multi_start} - start of the language-specific multi-line comment (ex. /*)
50- ([\d\D]*?) - capture all of the characters...
51- {multi_end} - until you find the end of the language-specific multi-line comment (ex. */)
52- | - If you don't find any of those...
53- {single} - start by finding the single-line comment token (ex. //)
54- (.*) - and capture all characters until the end of the line
55-
56- Returns a 2-tuple of:
57- - ("Comment text", None) in the case of a multi-line comment OR
58- - (None, "Comment text") in the case of a single-line comment
# Format string for the regex that finds all comments in the file. The format tokens will be replaced
# with the language-specific comment definitions defined in the sub-classes, and the resulting pattern
# must be compiled with re.VERBOSE (whitespace and "#" comments inside the pattern are ignored).
#
# Match groupdict will contain two named subgroups: 'comment' (the body of a multi-line comment, which
# may be an empty string) and 'prefixed_comment' (a run of consecutive single-line comments, comment
# tokens included). Exactly one of them will be non-None.
#
# The single-line group must repeat at least once ("+"): with "*" the second alternative also matches
# the empty string, so finditer() would yield an empty match at every position that is not a comment.
comment_regex_fmt = r"""
    {multi_start}               # start of the language-specific multi-line comment (ex. /*)
    (?P<comment>                # Look for a multiline comment
        [\d\D]*?                # capture all of the characters...
    )
    {multi_end}                 # until you find the end of the language-specific multi-line comment (ex. */)
    |                           # If you don't find any of those...
    (?P<prefixed_comment>       # Look for a group of single-line comments
        (?:                     # Non-capture mode
            {single}            # start by finding the single-line comment token (ex. //)
            .*                  # and capture all characters until the end of the line
            \n?                 # followed by an optional newline
            \ *                 # and some empty space
        )+                      # one or more times, so an empty match is never produced
    )
"""
60- comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)'
6166
6267 def __init__ (self , config , echo ):
6368 """
@@ -74,7 +79,12 @@ def __init__(self, config, echo):
7479
7580 # pylint: disable=not-a-mapping
7681 self .comment_regex = re .compile (
77- self .comment_regex_fmt .format (** self .lang_comment_definition )
82+ self .comment_regex_fmt .format (** self .lang_comment_definition ),
83+ flags = re .VERBOSE
84+ )
85+ self .prefixed_comment_regex = re .compile (
86+ r"^ *{single}" .format (** self .lang_comment_definition ),
87+ flags = re .MULTILINE
7888 )
7989
8090 # Parent class will allow this class to populate self.strings_to_search via
@@ -102,15 +112,15 @@ def search(self, file_handle):
102112 if any (anno in txt for anno in self .config .annotation_tokens ):
103113 fname = clean_abs_path (file_handle .name , self .config .source_path )
104114
115+ # Iterate on all comments: both prefixed- and non-prefixed.
105116 for match in self .comment_regex .finditer (txt ):
106- # Should only be one match
107- comment_content = [item for item in match .groups () if item is not None ][0 ]
108- for inner_match in self .query .finditer (comment_content ):
109- # Get the line number by counting newlines + 1 (for the first line).
110- # Note that this is the line number of the beginning of the comment, not the
111- # annotation token itself.
112- line = txt .count ('\n ' , 0 , match .start ()) + 1
117+ # Get the line number by counting newlines + 1 (for the first line).
118+ # Note that this is the line number of the beginning of the comment, not the
119+ # annotation token itself.
120+ line = txt .count ('\n ' , 0 , match .start ()) + 1
113121
122+ comment_content = self ._find_comment_content (match )
123+ for inner_match in self .query .finditer (comment_content ):
114124 try :
115125 annotation_token = inner_match .group ('token' )
116126 annotation_data = inner_match .group ('data' )
@@ -131,3 +141,27 @@ def search(self, file_handle):
131141 })
132142
133143 return found_annotations
144+
145+ def _find_comment_content (self , match ):
146+ """
147+ Return the comment content as text.
148+
149+ Args:
150+ match (sre.SRE_MATCH): one of the matches of the self.comment_regex regular expression.
151+ """
152+ comment_content = match .groupdict ()["comment" ]
153+ if comment_content :
154+ return comment_content
155+
156+ # Find single-line comments and strip comment tokens
157+ comment_content = match .groupdict ()["prefixed_comment" ]
158+ return self ._strip_single_line_comment_tokens (comment_content )
159+
160+ def _strip_single_line_comment_tokens (self , content ):
161+ """
162+ Strip the leading single-line comment tokens from a comment text.
163+
164+ Args:
165+ content (str): token-prefixed multi-line comment string.
166+ """
167+ return self .prefixed_comment_regex .sub ("" , content )
0 commit comments