Add --exclude to large files filter

This commit is contained in:
Vojta Tuma 2025-10-17 10:59:34 +02:00
parent 3fed74c572
commit 89b1f940f0
3 changed files with 30 additions and 1 deletions

View file

@ -26,6 +26,7 @@ Add this to your `.pre-commit-config.yaml`
#### `check-added-large-files`
Prevent giant files from being committed.
- Specify what is "too large" with `args: ['--maxkb=123']` (default=500kB).
- Optionally exclude files matching glob-style patterns, given as a comma-separated list: `args: ['--exclude=uv.lock,examples/*.ipynb']`.
- Limits checked files to those indicated as staged for addition by git.
- If `git-lfs` is installed, lfs files will be skipped
(requires `git-lfs>=2.2.1`)

View file

@ -5,6 +5,7 @@ import math
import os
import subprocess
from collections.abc import Sequence
from fnmatch import fnmatch
from pre_commit_hooks.util import added_files
from pre_commit_hooks.util import zsplit
@ -34,12 +35,17 @@ def find_large_added_files(
filenames: Sequence[str],
maxkb: int,
*,
exclude: list[str] | None = None,
enforce_all: bool = False,
) -> int:
# Find all added files that are also in the list of files pre-commit tells
# us about
retv = 0
filenames_filtered = set(filenames)
exclude = [] if not exclude else exclude
filenames_filtered = {
fname for fname in filenames
if not any(fnmatch(fname, pat) for pat in exclude)
}
filter_lfs_files(filenames_filtered)
if not enforce_all:
@ -68,12 +74,17 @@ def main(argv: Sequence[str] | None = None) -> int:
'--maxkb', type=int, default=500,
help='Maximum allowable KB for added files',
)
parser.add_argument(
'--exclude', type=str, default='',
help='Comma-separated list of glob-style patterns to be excluded',
)
args = parser.parse_args(argv)
return find_large_added_files(
args.filenames,
args.maxkb,
enforce_all=args.enforce_all,
exclude=args.exclude.split(','),
)

View file

@ -43,6 +43,23 @@ def test_add_something_giant(temp_git_dir):
assert find_large_added_files(['f.py'], 10) == 0
def test_use_exclude(temp_git_dir):
# Verify that files matching an `exclude` pattern are left out of the
# large-file check, while non-matching files are still reported.
with temp_git_dir.as_cwd():
# Two 10,000-character files, both staged; each call below uses maxkb=1.
temp_git_dir.join('uv.lock').write('a' * 10_000)
temp_git_dir.join('big.baddie').write('a' * 10_000)
cmd_output('git', 'add', 'uv.lock')
cmd_output('git', 'add', 'big.baddie')
# should fail because of big.baddie, as that's not excluded
assert find_large_added_files(
['uv.lock', 'big.baddie'], 1, exclude=['*.lock'],
) == 1
# should pass when every file is excluded, both via a wildcard pattern
# and via an exact filename match
assert find_large_added_files(['uv.lock'], 1, exclude=['*.lock']) == 0
assert find_large_added_files(['uv.lock'], 1, exclude=['uv.lock']) == 0
def test_enforce_all(temp_git_dir):
with temp_git_dir.as_cwd():
temp_git_dir.join('f.py').write('a' * 10000)