Add utils.parse_unified_diff function

This will read stdin using utils.stdin_get_value and parse it into a dictionary similar to pep8/pycodestyle's parse_udiff function. This differs from that function in the order in which it parses the diff and also in that it does not try to filter the paths in the diff. It lets a different level take care of that for it. This will likely be used to send paths to the FileCheckerManager and also send the ranges to the StyleGuide so it can filter errors not in those ranges.
2026-06-29 08:40:45 +00:00 · 2016-06-06 08:41:22 -05:00 · 2016-06-06 08:41:22 -05:00 · 0cdde196e8
commit 0cdde196e8
parent c4f463e4fb
1 changed files with 71 additions and 0 deletions
--- a/flake8/utils.py
+++ b/flake8/utils.py
@ -1,10 +1,14 @@
 """Utility methods for flake8."""
+import collections
 import fnmatch as _fnmatch
 import inspect
 import io
 import os
+import re
 import sys

+DIFF_HUNK_REGEXP = re.compile(r'^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@.*$')
+

 def parse_comma_separated_list(value):
    # type: (Union[Sequence[str], str]) -> List[str]
@ -70,6 +74,73 @@ def stdin_get_value():
    return cached_value.getvalue()


+def parse_unified_diff():
+    # type: () -> List[str]
+    """Parse the unified diff passed on stdin.
+
+    :returns: dictionary mapping file names to sets of ranges
+    :rtype: dict
+    """
+    diff = stdin_get_value()
+    number_of_rows = None
+    current_path = None
+    parsed_paths = collections.defaultdict(set)
+    for line in diff.splitlines():
+        if number_of_rows:
+            # NOTE(sigmavirus24): Below we use a slice because stdin may be
+            # bytes instead of text on Python 3.
+            if line[:1] != '-':
+                number_of_rows -= 1
+            # We're in the part of the diff that has lines starting with +, -,
+            # and ' ' to show context and the changes made. We skip these
+            # because the information we care about is the filename and the
+            # range within it.
+            # When number_of_rows reaches 0, we will once again start
+            # searching for filenames and ranges.
+            continue
+
+        # NOTE(sigmavirus24): Diffs that we support look roughly like:
+        #    diff a/file.py b/file.py
+        #    ...
+        #    --- a/file.py
+        #    +++ b/file.py
+        # Below we're looking for that last line. Every diff tool that
+        # gives us this output may have additional information after
+        # ``b/file.py`` which it will separate with a \t, e.g.,
+        #    +++ b/file.py\t100644
+        # Which is an example that has the new file permissions/mode.
+        # In this case we only care about the file name.
+        if line[:3] == '+++':
+            current_path = line[4:].split('\t', 1)[0]
+            # NOTE(sigmavirus24): This check is for diff output from git.
+            if current_path[:2] == 'b/':
+                current_path = current_path[2:]
+            # We don't need to do anything else. We have set up our local
+            # ``current_path`` variable. We can skip the rest of this loop.
+            # The next line we will see will give us the hung information
+            # which is in the next section of logic.
+            continue
+
+        hunk_match = DIFF_HUNK_REGEXP.match(line)
+        # NOTE(sigmavirus24): pep8/pycodestyle check for:
+        #    line[:3] == '@@ '
+        # But the DIFF_HUNK_REGEXP enforces that the line start with that
+        # So we can more simply check for a match instead of slicing and
+        # comparing.
+        if hunk_match:
+            (row, number_of_rows) = [
+                1 if not group else int(group)
+                for group in hunk_match.groups()
+            ]
+            parsed_paths[current_path].update(
+                range(row, row + number_of_rows)
+            )
+
+    # We have now parsed our diff into a dictionary that looks like:
+    #    {'file.py': set(range(10, 16), range(18, 20)), ...}
+    return parsed_paths
+
+
 def is_windows():
    # type: () -> bool
    """Determine if we're running on Windows.