Add mixed-line-ending hook

2026-06-30 06:30:47 +00:00 · 2017-06-13 21:38:14 +02:00 · 2017-06-13 21:38:14 +02:00 · fc8a5b27e9
commit fc8a5b27e9
parent 78dffcc819
6 changed files with 388 additions and 0 deletions
--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@ -191,6 +191,15 @@
    # for backward compatibility
    files: ''
    minimum_pre_commit_version: 0.15.0
+-   id: mixed-line-ending
+    name: Mixed line ending
+    description: Replaces or checks mixed line ending
+    entry: mixed-line-ending
+    language: python
+    types: [text]
+    # for backward compatibility
+    files: ''
+    minimum_pre_commit_version: 0.15.0
 -   id: name-tests-test
    name: Tests should end in _test.py
    description: This verifies that test files are named correctly
--- a/README.md
+++ b/README.md
@ -58,6 +58,11 @@ Add this to your `.pre-commit-config.yaml`
 - `file-contents-sorter` - Sort the lines in specified files (defaults to alphabetical). You must provide list of target files as input to it. Note that this hook WILL remove blank lines and does NOT respect any comments.
 - `flake8` - Run flake8 on your python files.
 - `forbid-new-submodules` - Prevent addition of new git submodules.
+- `mixed-line-ending` - Replaces or checks mixed line ending.
+    - `--fix={auto,crlf,lf,no}`
+        - `auto` - For files with mixed line endings, converts every line ending to the most frequent one in that file. This is the default argument.
+        - `crlf`, `lf` - Forces every line ending to CRLF or LF, respectively.
+        - `no` - Checks if there is any mixed line ending without modifying any file.
 - `name-tests-test` - Assert that files in tests/ end in `_test.py`.
    - Use `args: ['--django']` to match `test*.py` instead.
 - `no-commit-to-branch` - Protect specific branches from direct checkins.
--- a/hooks.yaml
+++ b/hooks.yaml
@ -130,6 +130,12 @@
    entry: upgrade-your-pre-commit-version
    files: ''
    minimum_pre_commit_version: 0.15.0
+-   id: mixed-line-ending
+    language: system
+    name: upgrade-your-pre-commit-version
+    entry: upgrade-your-pre-commit-version
+    files: ''
+    minimum_pre_commit_version: 0.15.0
 -   id: name-tests-test
    language: system
    name: upgrade-your-pre-commit-version
--- a/pre_commit_hooks/mixed_line_ending.py
+++ b/pre_commit_hooks/mixed_line_ending.py
@ -0,0 +1,212 @@
+import argparse
+import re
+import sys
+
+from enum import Enum
+
+
+class LineEnding(Enum):
+    """Line-ending styles recognised by the hook.
+
+    Each member's value is a 3-tuple that ``Enum`` unpacks into
+    ``__init__``: the raw byte sequence, a short lowercase name, and a
+    compiled bytes regex matching only that style.
+    """
+    # negative lookahead: a CR that is not the first half of a CRLF pair
+    CR = b'\r', 'cr', re.compile(b'\r(?!\n)', re.DOTALL)
+    CRLF = b'\r\n', 'crlf', re.compile(b'\r\n', re.DOTALL)
+    # negative lookbehind: an LF that is not the second half of a CRLF pair
+    LF = b'\n', 'lf', re.compile(b'(?<!\r)\n', re.DOTALL)
+
+    def __init__(self, string, opt_name, regex):
+        # raw bytes of the line ending
+        self.string = string
+        # repr() of the bytes, so the ending can be shown in messages
+        self.str_print = repr(string)
+        # short lowercase name of the style ('cr', 'crlf', 'lf')
+        self.opt_name = opt_name
+        # compiled pattern matching occurrences of exactly this style
+        self.regex = regex
+
+
+class MixedLineEndingOption(Enum):
+    """Accepted values of the ``--fix`` command-line option.
+
+    Each member's value is a pair unpacked into ``__init__``: the option
+    text and the ``LineEnding`` to force (``None`` for ``auto`` and ``no``,
+    which do not force a particular style).
+    """
+    AUTO = 'auto', None
+    NO = 'no', None
+    CRLF = LineEnding.CRLF.opt_name, LineEnding.CRLF
+    LF = LineEnding.LF.opt_name, LineEnding.LF
+
+    def __init__(self, opt_name, line_ending_enum):
+        # text the user passes to --fix
+        self.opt_name = opt_name
+        # LineEnding member to convert to, or None
+        self.line_ending_enum = line_ending_enum
+
+
+class MixedLineDetection(Enum):
+    """Possible outcomes of inspecting a file's line endings.
+
+    Each member's value is a triple unpacked into ``__init__``: a unique
+    index, whether the member counts as "mixed line ending found", and the
+    most frequent ``LineEnding`` (``None`` when there is none).
+    """
+    NOT_MIXED = 1, False, None
+    UNKNOWN = 2, False, None
+    MIXED_MOSTLY_CRLF = 3, True, LineEnding.CRLF
+    MIXED_MOSTLY_LF = 4, True, LineEnding.LF
+    MIXED_MOSTLY_CR = 5, True, LineEnding.CR
+
+    def __init__(self, index, mle_found, line_ending_enum):
+        # The index exists only to keep member values distinct: without it
+        # NOT_MIXED and UNKNOWN would share the value (False, None) and Enum
+        # would turn the second one into an alias of the first.
+        self.index = index
+        # True when the member means "mixed endings with a clear majority"
+        self.mle_found = mle_found
+        # majority LineEnding, or None
+        self.line_ending_enum = line_ending_enum
+
+
+# Matches one line ending of any style (CRLF, lone LF or lone CR). It is the
+# alternation of the three per-style patterns wrapped in a single group.
+ANY_LINE_ENDING_PATTERN = re.compile(
+    b'(' + b'|'.join(
+        style.regex.pattern
+        for style in (LineEnding.CRLF, LineEnding.LF, LineEnding.CR)
+    ) + b')',
+)
+
+
+def mixed_line_ending(argv=None):
+    """Entry point of the hook: parse ``argv`` and run the selected mode.
+
+    :param argv: argument list handed to argparse (``None`` lets argparse
+        read ``sys.argv``).
+    :return: the integer returned by the selected ``_process_*`` function.
+    """
+    parsed = _parse_arguments(argv)
+    paths = parsed['filenames']
+    mode = parsed['fix']
+
+    if mode == MixedLineEndingOption.NO:
+        return _process_no_fix(paths)
+
+    if mode == MixedLineEndingOption.AUTO:
+        return _process_fix_auto(paths)
+
+    # every remaining option forces one specific line ending
+    return _process_fix_force(paths, mode.line_ending_enum)
+
+
+def _parse_arguments(argv=None):
+    """Parse the command line into a plain options dictionary.
+
+    :param argv: argument list handed to argparse (``None`` lets argparse
+        read ``sys.argv``).
+    :return: dict with ``'fix'`` (a ``MixedLineEndingOption`` member) and
+        ``'filenames'`` (list of positional file names).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-f',
+        '--fix',
+        choices=[option.opt_name for option in MixedLineEndingOption],
+        default=MixedLineEndingOption.AUTO.opt_name,
+        help='Replace line ending with the specified. Default is "auto"',
+    )
+    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    namespace = parser.parse_args(argv)
+
+    # argparse restricted --fix to the known option names, so exactly one
+    # member matches; the single-target unpacking enforces that.
+    candidates = [
+        option for option in MixedLineEndingOption
+        if option.opt_name == namespace.fix
+    ]
+    selected, = candidates
+
+    return {'fix': selected, 'filenames': namespace.filenames}
+
+
+def _detect_line_ending(filename):
+    """Classify the line endings used in ``filename``.
+
+    :param filename: path of the file, read in binary mode.
+    :return: ``MixedLineDetection.NOT_MIXED`` when at most one style occurs,
+        the ``MIXED_MOSTLY_*`` member of the single most frequent style when
+        several occur, or ``MixedLineDetection.UNKNOWN`` when several occur
+        and the highest count is shared.
+    """
+    with open(filename, 'rb') as f:
+        contents = f.read()
+
+    # occurrences of each style in the whole file
+    counts = {
+        style: len(style.regex.findall(contents)) for style in LineEnding
+    }
+
+    present = [style for style, count in counts.items() if count > 0]
+    if len(present) < 2:
+        return MixedLineDetection.NOT_MIXED
+
+    highest = max(counts.values())
+    leaders = [style for style in present if counts[style] == highest]
+    if len(leaders) != 1:
+        # several styles share the top count: no clear majority
+        return MixedLineDetection.UNKNOWN
+
+    dominant = leaders[0]
+    for detection in MixedLineDetection:
+        if detection.line_ending_enum == dominant:
+            return detection
+
+    return MixedLineDetection.UNKNOWN
+
+
+def _process_no_fix(filenames):
+    """Report files that have mixed line endings without modifying them.
+
+    :param filenames: iterable of file paths to inspect.
+    :return: 1 when at least one file is reported, 0 otherwise.
+
+    NOTE(review): only detection results whose ``mle_found`` is true are
+    reported, so a file whose result is ``UNKNOWN`` (several styles, no
+    single most frequent one) is not listed here -- confirm this is
+    intended.
+    """
+    print('Checking if the files have mixed line ending.')
+
+    mle_filenames = [
+        filename for filename in filenames
+        if _detect_line_ending(filename).mle_found
+    ]
+
+    if not mle_filenames:
+        return 0
+
+    # Interpolate explicitly: print() does not apply %-formatting to its
+    # arguments, so passing the names separately left a literal '%s'.
+    print(
+        'The following files have mixed line endings:\n\t%s' %
+        ('\n\t'.join(mle_filenames),)
+    )
+
+    return 1
+
+
+def _process_fix_auto(filenames):
+    """Convert each mixed file to its most frequent line ending.
+
+    Files that are not mixed are left alone. Files with several styles but
+    no single most frequent one are skipped, yet still count as a failure.
+
+    :param filenames: iterable of file paths to process.
+    :return: 1 when at least one file had mixed line endings (converted or
+        skipped), 0 otherwise.
+    """
+    mle_found = False
+
+    for filename in filenames:
+        detect_result = _detect_line_ending(filename)
+
+        # Messages are %-interpolated explicitly: print() does not format
+        # its arguments, so passing them separately left literal '%s'.
+        if detect_result == MixedLineDetection.NOT_MIXED:
+            print('The file %s has no mixed line ending' % (filename,))
+            continue
+
+        mle_found = True
+
+        if detect_result == MixedLineDetection.UNKNOWN:
+            print(
+                'Could not define most frequent line ending in '
+                'file %s. File skipped.' % (filename,)
+            )
+            continue
+
+        le_enum = detect_result.line_ending_enum
+
+        print(
+            'The file %s has mixed line ending with a '
+            'majority of %s. Converting...' % (filename, le_enum.str_print)
+        )
+
+        _convert_line_ending(filename, le_enum.string)
+
+        print(
+            'The file %s has been converted to %s line ending.' %
+            (filename, le_enum.str_print)
+        )
+
+    return 1 if mle_found else 0
+
+
+def _process_fix_force(filenames, line_ending_enum):
+    """Rewrite every file so that it only uses the given line ending.
+
+    :param filenames: iterable of file paths to convert.
+    :param line_ending_enum: ``LineEnding`` member to convert to.
+    :return: always 1, whether or not any file content changed.
+    """
+    for filename in filenames:
+        _convert_line_ending(filename, line_ending_enum.string)
+
+        # Interpolate explicitly: print() does not apply %-formatting to
+        # its arguments, so passing them separately left literal '%s'.
+        print(
+            'The file %s has been forced to %s line ending.' %
+            (filename, line_ending_enum.str_print)
+        )
+
+    return 1
+
+
+def _convert_line_ending(filename, line_ending):
+    """Rewrite ``filename`` in place using ``line_ending`` everywhere.
+
+    :param filename: path of the file, opened for binary read/write.
+    :param line_ending: bytes that replace every matched line ending.
+    """
+    with open(filename, 'rb+') as stream:
+        converted = ANY_LINE_ENDING_PATTERN.sub(line_ending, stream.read())
+
+        # rewind, overwrite from the start, then cut off whatever remains
+        # of the old content past the new end
+        stream.seek(0)
+        stream.write(converted)
+        stream.truncate()
+
+
+# When run as a script, execute the hook and use its return value as the
+# process exit status.
+if __name__ == '__main__':
+    sys.exit(mixed_line_ending())
--- a/setup.py
+++ b/setup.py
@ -31,6 +31,7 @@ setup(
        'simplejson',
        'six',
    ],
+    extras_require={':python_version=="2.7"': ['enum34']},
    entry_points={
        'console_scripts': [
            'autopep8-wrapper = pre_commit_hooks.autopep8_wrapper:main',
@ -53,6 +54,7 @@ setup(
            'file-contents-sorter = pre_commit_hooks.file_contents_sorter:main',
            'fix-encoding-pragma = pre_commit_hooks.fix_encoding_pragma:main',
            'forbid-new-submodules = pre_commit_hooks.forbid_new_submodules:main',
+            'mixed-line-ending = pre_commit_hooks.mixed_line_ending:mixed_line_ending',
            'name-tests-test = pre_commit_hooks.tests_should_end_in_test:validate_files',
            'no-commit-to-branch = pre_commit_hooks.no_commit_to_branch:main',
            'pretty-format-json = pre_commit_hooks.pretty_format_json:pretty_format_json',
--- a/tests/mixed_line_ending_test.py
+++ b/tests/mixed_line_ending_test.py
@ -0,0 +1,154 @@
+import pytest
+
+from pre_commit_hooks.mixed_line_ending import mixed_line_ending
+
+# Input, expected return value, expected output
+TESTS_FIX_AUTO = (
+    # only 'LF'
+    (b'foo\nbar\nbaz\n', 0, b'foo\nbar\nbaz\n'),
+    # only 'CRLF'
+    (b'foo\r\nbar\r\nbaz\r\n', 0, b'foo\r\nbar\r\nbaz\r\n'),
+    # only 'CR'
+    (b'foo\rbar\rbaz\r', 0, b'foo\rbar\rbaz\r'),
+    # mixed with majority of 'LF'
+    (b'foo\r\nbar\nbaz\n', 1, b'foo\nbar\nbaz\n'),
+    # mixed with majority of 'CRLF'
+    (b'foo\r\nbar\nbaz\r\n', 1, b'foo\r\nbar\r\nbaz\r\n'),
+    # mixed with majority of 'CR'
+    (b'foo\rbar\nbaz\r', 1, b'foo\rbar\rbaz\r'),
+    # mixed with as much 'LF' as 'CRLF'
+    (b'foo\r\nbar\nbaz', 1, b'foo\r\nbar\nbaz'),
+    # mixed with as much 'LF' as 'CR'
+    (b'foo\rbar\nbaz', 1, b'foo\rbar\nbaz'),
+    # mixed with as much 'CRLF' as 'CR'
+    (b'foo\r\nbar\rbaz', 1, b'foo\r\nbar\rbaz'),
+    # mixed with as much 'CRLF' as 'LF' as 'CR'
+    (b'foo\r\nbar\nbaz\r', 1, b'foo\r\nbar\nbaz\r'),
+)
+
+
+@pytest.mark.parametrize(
+    ('input_s', 'expected_retval', 'output'),
+    TESTS_FIX_AUTO,
+)
+def test_mixed_line_ending_fix_auto(input_s, expected_retval, output, tmpdir):
+    """``--fix=auto`` converts to the majority ending, or skips on a tie."""
+    path = tmpdir.join('file.txt')
+    # write the bytes verbatim; a text-mode write may translate newlines
+    path.write_binary(input_s)
+    ret = mixed_line_ending(('--fix=auto', path.strpath))
+
+    assert ret == expected_retval
+    assert path.read_binary() == output
+
+
+# Input, expected return value, expected output
+TESTS_NO_FIX = (
+    # only 'LF'
+    (b'foo\nbar\nbaz\n', 0, b'foo\nbar\nbaz\n'),
+    # only 'CRLF'
+    (b'foo\r\nbar\r\nbaz\r\n', 0, b'foo\r\nbar\r\nbaz\r\n'),
+    # only 'CR'
+    (b'foo\rbar\rbaz\r', 0, b'foo\rbar\rbaz\r'),
+    # mixed with majority of 'LF'
+    (b'foo\r\nbar\nbaz\n', 1, b'foo\r\nbar\nbaz\n'),
+    # mixed with majority of 'CRLF'
+    (b'foo\r\nbar\nbaz\r\n', 1, b'foo\r\nbar\nbaz\r\n'),
+    # mixed with majority of 'CR'
+    (b'foo\rbar\nbaz\r', 1, b'foo\rbar\nbaz\r'),
+    # mixed with as much 'LF' as 'CRLF'
+    (b'foo\r\nbar\nbaz', 0, b'foo\r\nbar\nbaz'),
+    # mixed with as much 'LF' as 'CR'
+    (b'foo\rbar\nbaz', 0, b'foo\rbar\nbaz'),
+    # mixed with as much 'CRLF' as 'CR'
+    (b'foo\r\nbar\rbaz', 0, b'foo\r\nbar\rbaz'),
+    # mixed with as much 'CRLF' as 'LF' as 'CR'
+    (b'foo\r\nbar\nbaz\r', 0, b'foo\r\nbar\nbaz\r'),
+)
+
+
+@pytest.mark.parametrize(
+    ('input_s', 'expected_retval', 'output'),
+    TESTS_NO_FIX,
+)
+def test_detect_mixed_line_ending(input_s, expected_retval, output, tmpdir):
+    """``--fix=no`` only reports and never changes the file content."""
+    path = tmpdir.join('file.txt')
+    # write the bytes verbatim; a text-mode write may translate newlines
+    path.write_binary(input_s)
+    ret = mixed_line_ending(('--fix=no', path.strpath))
+
+    assert ret == expected_retval
+    assert path.read_binary() == output
+
+
+# Input, expected return value, expected output
+TESTS_FIX_FORCE_LF = (
+    # only 'LF'
+    (b'foo\nbar\nbaz\n', 1, b'foo\nbar\nbaz\n'),
+    # only 'CRLF'
+    (b'foo\r\nbar\r\nbaz\r\n', 1, b'foo\nbar\nbaz\n'),
+    # only 'CR'
+    (b'foo\rbar\rbaz\r', 1, b'foo\nbar\nbaz\n'),
+    # mixed with majority of 'LF'
+    (b'foo\r\nbar\nbaz\n', 1, b'foo\nbar\nbaz\n'),
+    # mixed with majority of 'CRLF'
+    (b'foo\r\nbar\nbaz\r\n', 1, b'foo\nbar\nbaz\n'),
+    # mixed with majority of 'CR'
+    (b'foo\rbar\nbaz\r', 1, b'foo\nbar\nbaz\n'),
+    # mixed with as much 'LF' as 'CRLF'
+    (b'foo\r\nbar\nbaz', 1, b'foo\nbar\nbaz'),
+    # mixed with as much 'LF' as 'CR'
+    (b'foo\rbar\nbaz', 1, b'foo\nbar\nbaz'),
+    # mixed with as much 'CRLF' as 'CR'
+    (b'foo\r\nbar\rbaz', 1, b'foo\nbar\nbaz'),
+    # mixed with as much 'CRLF' as 'LF' as 'CR'
+    (b'foo\r\nbar\nbaz\r', 1, b'foo\nbar\nbaz\n'),
+)
+
+
+@pytest.mark.parametrize(
+    ('input_s', 'expected_retval', 'output'),
+    TESTS_FIX_FORCE_LF,
+)
+def test_mixed_line_ending_fix_force_lf(
+    input_s, expected_retval, output,
+    tmpdir,
+):
+    """``--fix=lf`` rewrites every line ending as LF."""
+    path = tmpdir.join('file.txt')
+    # write the bytes verbatim; a text-mode write may translate newlines
+    path.write_binary(input_s)
+    ret = mixed_line_ending(('--fix=lf', path.strpath))
+
+    assert ret == expected_retval
+    assert path.read_binary() == output
+
+
+# Input, expected return value, expected output
+TESTS_FIX_FORCE_CRLF = (
+    # only 'LF'
+    (b'foo\nbar\nbaz\n', 1, b'foo\r\nbar\r\nbaz\r\n'),
+    # only 'CRLF'
+    (b'foo\r\nbar\r\nbaz\r\n', 1, b'foo\r\nbar\r\nbaz\r\n'),
+    # only 'CR'
+    (b'foo\rbar\rbaz\r', 1, b'foo\r\nbar\r\nbaz\r\n'),
+    # mixed with majority of 'LF'
+    (b'foo\r\nbar\nbaz\n', 1, b'foo\r\nbar\r\nbaz\r\n'),
+    # mixed with majority of 'CRLF'
+    (b'foo\r\nbar\nbaz\r\n', 1, b'foo\r\nbar\r\nbaz\r\n'),
+    # mixed with majority of 'CR'
+    (b'foo\rbar\nbaz\r', 1, b'foo\r\nbar\r\nbaz\r\n'),
+    # mixed with as much 'LF' as 'CRLF'
+    (b'foo\r\nbar\nbaz', 1, b'foo\r\nbar\r\nbaz'),
+    # mixed with as much 'LF' as 'CR'
+    (b'foo\rbar\nbaz', 1, b'foo\r\nbar\r\nbaz'),
+    # mixed with as much 'CRLF' as 'CR'
+    (b'foo\r\nbar\rbaz', 1, b'foo\r\nbar\r\nbaz'),
+    # mixed with as much 'CRLF' as 'LF' as 'CR'
+    (b'foo\r\nbar\nbaz\r', 1, b'foo\r\nbar\r\nbaz\r\n'),
+)
+
+
+@pytest.mark.parametrize(
+    ('input_s', 'expected_retval', 'output'),
+    TESTS_FIX_FORCE_CRLF,
+)
+def test_mixed_line_ending_fix_force_crlf(
+    input_s, expected_retval, output,
+    tmpdir,
+):
+    """``--fix=crlf`` rewrites every line ending as CRLF."""
+    path = tmpdir.join('file.txt')
+    # write the bytes verbatim; a text-mode write may translate newlines
+    path.write_binary(input_s)
+    ret = mixed_line_ending(('--fix=crlf', path.strpath))
+
+    assert ret == expected_retval
+    assert path.read_binary() == output