From 9e417077fce618ddb2d8a3cca2eba09a280178b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Skytt=C3=A4?=
Date: Thu, 6 Feb 2020 18:43:41 +0200
Subject: [PATCH] Add check for unicode replacement characters

---
 .pre-commit-hooks.yaml                       |  6 +++
 README.md                                    |  3 ++
 .../check_unicode_replacement_char.py        | 53 +++++++++++++++++++
 setup.cfg                                    |  1 +
 tests/check_unicode_replacement_char_test.py | 13 +++++
 5 files changed, 76 insertions(+)
 create mode 100644 pre_commit_hooks/check_unicode_replacement_char.py
 create mode 100644 tests/check_unicode_replacement_char_test.py

diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml
index 666a245..e65c928 100644
--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@@ -22,6 +22,12 @@
     entry: check-byte-order-marker
     language: python
     types: [text]
+-   id: check-unicode-replacement-char
+    name: Check for Unicode replacement character
+    description: Forbid files which have a UTF-8 Unicode replacement character
+    entry: check-unicode-replacement-char
+    language: python
+    types: [text]
 -   id: check-builtin-literals
     name: Check builtin type constructor use
     description: Require literal syntax when initializing empty or zero Python builtin types.
diff --git a/README.md b/README.md
index c98ba1b..81a30bf 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,9 @@ Checks for symlinks which do not point to anything.
 
 #### `check-toml`
 Attempts to load all TOML files to verify syntax.
 
+#### `check-unicode-replacement-char`
+Forbid files which have a UTF-8 Unicode replacement character.
+
 #### `check-vcs-permalinks`
 Ensures that links to vcs websites are permalinks.
 
diff --git a/pre_commit_hooks/check_unicode_replacement_char.py b/pre_commit_hooks/check_unicode_replacement_char.py
new file mode 100644
index 0000000..02bf888
--- /dev/null
+++ b/pre_commit_hooks/check_unicode_replacement_char.py
@@ -0,0 +1,53 @@
+import argparse
+import fileinput
+from typing import Optional
+from typing import Sequence
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    """Report lines containing a UTF-8 encoded U+FFFD replacement character.
+
+    Files are read as raw bytes; for each line containing the character,
+    the file name, line number and byte offset of its first occurrence on
+    that line are printed.
+
+    :param argv: command line arguments; ``None`` lets argparse fall back
+        to ``sys.argv[1:]``.
+    :returns: 1 if the character was found in any file, 0 otherwise.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    # fileinput falls back to reading stdin when given no files, which would
+    # block the hook; with nothing to check there is nothing to report.
+    if not args.filenames:
+        return 0
+
+    retv = 0
+
+    # A private FileInput instance (instead of the module-level
+    # fileinput.input()) is closed on every exit path and leaves no global
+    # state behind if scanning is interrupted.
+    with fileinput.FileInput(files=args.filenames, mode='rb') as lines:
+        for line in lines:
+            # b'\xEF\xBF\xBD' is U+FFFD encoded as UTF-8.
+            col = line.find(b'\xEF\xBF\xBD')
+            if col == -1:
+                continue
+            retv = 1
+            # Not saying filename:line:col: because that kind of format is
+            # usually used for character offsets, and we have a byte offset
+            # which might be different, emphasize that.
+            print(
+                f'{lines.filename()}:{lines.lineno()}: '
+                f'UTF-8 Unicode replacement character at byte {col}'
+            )
+
+    return retv
+
+
+if __name__ == '__main__':
+    # SystemExit does not depend on the exit() helper injected by the site
+    # module, which is missing under e.g. ``python -S``.
+    raise SystemExit(main())
diff --git a/setup.cfg b/setup.cfg
index 6b1a34d..f6e15df 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,6 +34,7 @@ console_scripts =
     check-ast = pre_commit_hooks.check_ast:main
     check-builtin-literals = pre_commit_hooks.check_builtin_literals:main
     check-byte-order-marker = pre_commit_hooks.check_byte_order_marker:main
+    check-unicode-replacement-char = pre_commit_hooks.check_unicode_replacement_char:main
     check-case-conflict = pre_commit_hooks.check_case_conflict:main
     check-docstring-first = pre_commit_hooks.check_docstring_first:main
     check-executables-have-shebangs = pre_commit_hooks.check_executables_have_shebangs:main
diff --git a/tests/check_unicode_replacement_char_test.py b/tests/check_unicode_replacement_char_test.py
new file mode 100644
index 0000000..35e80fb
--- /dev/null
+++ b/tests/check_unicode_replacement_char_test.py
@@ -0,0 +1,13 @@
+from pre_commit_hooks import check_unicode_replacement_char
+
+
+def test_failure(tmpdir):
+    f = tmpdir.join('f.txt')
+    f.write_text(str(b'\x80abc', errors='replace'), encoding='utf-8')
+    assert check_unicode_replacement_char.main((f.strpath,)) == 1
+
+
+def test_success(tmpdir):
+    f = tmpdir.join('f.txt')
+    f.write_text(str(b'\x80abc', errors='backslashreplace'), encoding='utf-8')
+    assert check_unicode_replacement_char.main((f.strpath,)) == 0