Skip to content

Commit 819e324

Browse files
committed
feat: add specific filters for hfh
1 parent 5d65364 commit 819e324

File tree

2 files changed

+227
-82
lines changed

2 files changed

+227
-82
lines changed

src/scanoss/file_filters.py

Lines changed: 226 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,19 @@
4646
'copying.lib',
4747
'makefile',
4848
}
49+
50+
# Exact file names skipped by default during folder hashing (HFH) scans.
# Names are lower-case because `_should_skip_file` compares them against the
# lower-cased base name of each candidate file.
DEFAULT_SKIPPED_FILES_HFH = {
    'gradlew',
    'gradlew.bat',
    'mvnw',
    'mvnw.cmd',
    'gradle-wrapper.jar',
    'maven-wrapper.jar',
    'thumbs.db',
    'babel.config.js',
}
60+
61+
4962
# Folders to skip
5063
DEFAULT_SKIPPED_DIRS = {
5164
'nbproject',
@@ -66,9 +79,26 @@
6679
'test',
6780
}
6881

82+
# Folder names skipped by default during folder hashing (HFH) scans.
# `_get_operation_patterns` turns each name into a `<name>/` ignore pattern
# unless all folders were requested.
DEFAULT_SKIPPED_DIRS_HFH = {
    'nbproject',
    'nbbuild',
    'nbdist',
    '__pycache__',
    'venv',
    '_yardoc',
    'eggs',
    'wheels',
    'htmlcov',
    '__pypackages__',
    'example',
    'examples',
}
96+
6997

7098
# Folder endings to skip
DEFAULT_SKIPPED_DIR_EXT = {'.egg-info'}
# Folder endings to skip during folder hashing (HFH) scans.
# NOTE(review): no use of the HFH variant is visible in this change - confirm it is wired in.
DEFAULT_SKIPPED_DIR_EXT_HFH = {'.egg-info'}
101+
72102
# File extensions to skip
73103
DEFAULT_SKIPPED_EXT = {
74104
'.1',
@@ -243,6 +273,162 @@
243273
'sqlite3',
244274
}
245275

276+
# TODO: For hfh add the .gitignore patterns
# File endings skipped by default during folder hashing (HFH) scans.
# `_should_skip_file` matches each entry with `str.endswith` against the
# lower-cased file name, so entries may be true extensions ('.json') or bare
# name endings ('readme').
DEFAULT_SKIPPED_EXT_HFH = {
    '.1',
    '.2',
    '.3',
    '.4',
    '.5',
    '.6',
    '.7',
    '.8',
    '.9',
    '.ac',
    '.adoc',
    '.am',
    '.asciidoc',
    '.bmp',
    '.build',
    '.cfg',
    '.chm',
    '.class',
    '.cmake',
    '.cnf',
    '.conf',
    '.config',
    '.contributors',
    '.copying',
    '.crt',
    '.csproj',
    '.css',
    '.csv',
    '.dat',
    '.data',
    '.dtd',
    '.dts',
    '.iws',
    '.c9',
    '.c9revisions',
    '.dtsi',
    '.dump',
    '.eot',
    '.eps',
    '.geojson',
    '.gif',
    '.glif',
    '.gmo',
    '.guess',
    '.hex',
    '.htm',
    '.html',
    '.ico',
    '.iml',
    '.in',
    '.inc',
    '.info',
    '.ini',
    '.ipynb',
    '.jpeg',
    '.jpg',
    '.json',
    '.jsonld',
    '.lock',
    '.log',
    '.m4',
    '.map',
    '.md5',
    '.meta',
    '.mk',
    '.mxml',
    '.o',
    '.otf',
    '.out',
    '.pbtxt',
    '.pdf',
    '.pem',
    '.phtml',
    '.plist',
    '.png',
    '.prefs',
    '.properties',
    '.pyc',
    '.qdoc',
    '.result',
    '.rgb',
    '.rst',
    '.scss',
    '.sha',
    '.sha1',
    '.sha2',
    '.sha256',
    '.sln',
    '.spec',
    '.sub',
    '.svg',
    '.svn-base',
    '.tab',
    '.template',
    '.test',
    '.tex',
    '.tiff',
    '.ttf',
    '.txt',
    '.utf-8',
    '.vim',
    '.wav',
    '.woff',
    '.woff2',
    '.xht',
    '.xhtml',
    '.xml',
    '.xpm',
    '.xsd',
    '.xul',
    '.yaml',
    '.yml',
    '.wfp',
    '.editorconfig',
    '.dotcover',
    '.pid',
    '.lcov',
    '.egg',
    '.manifest',
    '.cache',
    '.coverage',
    '.cover',
    '.gem',
    '.lst',
    '.pickle',
    '.pdb',
    '.gml',
    '.pot',
    '.plt',
    '.whml',
    '.pom',
    '.smtml',
    '.min.js',
    '.mf',
    '.base64',
    '.s',
    '.diff',
    '.patch',
    '.rules',
    # File endings
    '-doc',
    'config',
    'news',
    'readme',
    'swiftdoc',
    'texidoc',
    'todo',
    'version',
    'ignore',
    'manifest',
    'sqlite',
    'sqlite3',
}
431+
246432

247433
class FileFilters(ScanossBase):
248434
"""
@@ -267,6 +453,7 @@ def __init__(self, debug: bool = False, trace: bool = False, quiet: bool = False
267453
skip_size (int): Size to skip
268454
skip_extensions (list): Extensions to skip
269455
skip_folders (list): Folders to skip
456+
is_folder_hashing_scan (bool): Whether the operation is a folder hashing scan
270457
"""
271458
super().__init__(debug, trace, quiet)
272459

@@ -277,6 +464,7 @@ def __init__(self, debug: bool = False, trace: bool = False, quiet: bool = False
277464
self.skip_folders = kwargs.get('skip_folders', [])
278465
self.skip_size = kwargs.get('skip_size', 0)
279466
self.skip_extensions = kwargs.get('skip_extensions', [])
467+
self.is_folder_hashing_scan = kwargs.get('is_folder_hashing_scan', False)
280468
self.file_folder_pat_spec = self._get_file_folder_pattern_spec(kwargs.get('operation_type', 'scanning'))
281469
self.size_pat_rules = self._get_size_limit_pattern_rules(kwargs.get('operation_type', 'scanning'))
282470

@@ -336,36 +524,36 @@ def get_filtered_files_from_files(self, files: List[str], scan_root: str = None)
336524
"""
337525
filtered_files = []
338526
for file_path in files:
339-
if not os.path.exists(file_path) or not os.path.isfile(file_path) or os.path.islink(file_path):
340-
self.print_debug(
341-
f'WARNING: File {file_path} does not exist, is not a file, or is a symbolic link. Ignoring.'
342-
)
343-
continue
344-
345527
path_obj = Path(file_path)
346-
if not self.hidden_files_folders and any(part.startswith('.') for part in path_obj.parts):
347-
self.print_debug(f'Skipping file: {file_path} (in hidden directory or is hidden file)')
348-
continue
349-
350528
try:
351529
if scan_root:
352-
rel_path = os.path.relpath(file_path, scan_root)
530+
rel_path = path_obj.relative_to(scan_root)
353531
else:
354-
rel_path = os.path.relpath(file_path)
532+
rel_path = str(path_obj)
355533
except ValueError:
356-
# If file_path is broken, symlink ignore it
357534
self.print_debug(f'Ignoring file: {file_path} (broken symlink)')
358535
continue
536+
537+
if not path_obj.exists() or not path_obj.is_file() or path_obj.is_symlink():
538+
self.print_debug(
539+
f'WARNING: File {rel_path} does not exist, is not a file, or is a symbolic link. Ignoring.'
540+
)
541+
continue
542+
543+
if not self.hidden_files_folders and any(part.startswith('.') for part in path_obj.parts):
544+
self.print_debug(f'Skipping file: {rel_path} (in hidden directory or is hidden file)')
545+
continue
546+
359547
if self._should_skip_file(rel_path):
360548
continue
361549
try:
362-
file_size = os.path.getsize(file_path)
550+
file_size = path_obj.stat().st_size
363551
if file_size == 0:
364552
self.print_debug(f'Skipping file: {rel_path} (empty file)')
365553
continue
366554
min_size, max_size = self._get_operation_size_limits(file_path)
367555
if min_size <= file_size <= max_size:
368-
filtered_files.append(rel_path)
556+
filtered_files.append(str(rel_path))
369557
else:
370558
self.print_debug(
371559
f'Skipping file: {rel_path} (size {file_size} outside limits {min_size}-{max_size})'
@@ -379,8 +567,11 @@ def _get_file_folder_pattern_spec(self, operation_type: str = 'scanning'):
379567
"""
380568
Get file path pattern specification.
381569
382-
:param operation_type: which operation is being performed
383-
:return: List of file path patterns
570+
Args:
571+
operation_type (str): Type of operation ('scanning' or 'fingerprinting')
572+
573+
Returns:
574+
GitIgnoreSpec: GitIgnoreSpec object containing the file path patterns
384575
"""
385576
patterns = self._get_operation_patterns(operation_type)
386577
if patterns:
@@ -391,8 +582,11 @@ def _get_size_limit_pattern_rules(self, operation_type: str = 'scanning'):
391582
"""
392583
Get size limit pattern rules.
393584
394-
:param operation_type: which operation is being performed
395-
:return: List of size limit pattern rules
585+
Args:
586+
operation_type (str): Type of operation ('scanning' or 'fingerprinting')
587+
588+
Returns:
589+
List of size limit pattern rules
396590
"""
397591
if self.scanoss_settings:
398592
size_rules = self.scanoss_settings.get_skip_sizes(operation_type)
@@ -417,6 +611,14 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]:
417611
List[str]: Combined list of patterns to skip
418612
"""
419613
patterns = []
614+
615+
# Default patterns for skipping directories
616+
if not self.all_folders:
617+
DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
618+
for dir_name in DEFAULT_SKIPPED_DIR_LIST:
619+
patterns.append(f'{dir_name}/')
620+
621+
# Custom patterns added in SCANOSS settings file
420622
if self.scanoss_settings:
421623
patterns.extend(self.scanoss_settings.get_skip_patterns(operation_type))
422624
return patterns
@@ -505,18 +707,21 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
505707
"""
506708
file_name = os.path.basename(file_rel_path)
507709

710+
DEFAULT_SKIPPED_FILES_LIST = DEFAULT_SKIPPED_FILES_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_FILES
711+
DEFAULT_SKIPPED_EXT_LIST = DEFAULT_SKIPPED_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_EXT
712+
508713
if not self.hidden_files_folders and file_name.startswith('.'):
509714
self.print_debug(f'Skipping file: {file_rel_path} (hidden file)')
510715
return True
511716
if self.all_extensions:
512717
return False
513718
file_name_lower = file_name.lower()
514719
# Look for exact files
515-
if file_name_lower in DEFAULT_SKIPPED_FILES:
720+
if file_name_lower in DEFAULT_SKIPPED_FILES_LIST:
516721
self.print_debug(f'Skipping file: {file_rel_path} (matches default skip file)')
517722
return True
518723
# Look for file endings
519-
for ending in DEFAULT_SKIPPED_EXT:
724+
for ending in DEFAULT_SKIPPED_EXT_LIST:
520725
if file_name_lower.endswith(ending):
521726
self.print_debug(f'Skipping file: {file_rel_path} (matches default skip ending: {ending})')
522727
return True
@@ -531,39 +736,3 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
531736
self.print_debug(f'Skipping file: {file_rel_path} (matches custom pattern)')
532737
return True
533738
return False
534-
535-
def _should_skip_file_for_hfh(self, file_path: Path) -> bool:
536-
"""
537-
Check if a file should be skipped during folder hashing scan.
538-
539-
Args:
540-
file_path (Path): The path to the file to check.
541-
542-
Returns:
543-
bool: True if the file should be skipped, False otherwise.
544-
"""
545-
try:
546-
if (
547-
any(part.startswith('.') for part in file_path.parts) # Hidden files/folders
548-
or file_path.is_symlink() # Symlinks
549-
or file_path.stat().st_size == 0 # Empty files
550-
):
551-
self.print_debug(f'Skipping file: {file_path} (hidden/symlink/empty)')
552-
return True
553-
554-
# Files ending with null
555-
if file_path.suffix.lower() == '.txt':
556-
try:
557-
with open(file_path, 'rb') as f:
558-
if f.read().endswith(b'\x00'):
559-
self.print_debug(f'Skipping file: {file_path} (text file ending with null)')
560-
return True
561-
except (OSError, IOError):
562-
self.print_debug(f'Skipping file: {file_path} (cannot read file content)')
563-
return True
564-
565-
return False
566-
567-
except Exception as e:
568-
self.print_debug(f'Error checking file {file_path}: {str(e)}')
569-
return True

0 commit comments

Comments
 (0)