46
46
'copying.lib' ,
47
47
'makefile' ,
48
48
}
49
+
50
# Exact file names skipped by default when the folder hashing variant is selected.
# Entries are lowercase because they are compared against the lowercased file name.
DEFAULT_SKIPPED_FILES_HFH = {
    # Gradle wrapper files
    'gradlew',
    'gradlew.bat',
    'gradle-wrapper.jar',
    # Maven wrapper files
    'mvnw',
    'mvnw.cmd',
    'maven-wrapper.jar',
    # Miscellaneous
    'thumbs.db',
    'babel.config.js',
}
60
+
61
+
49
62
# Folders to skip
50
63
DEFAULT_SKIPPED_DIRS = {
51
64
'nbproject' ,
66
79
'test' ,
67
80
}
68
81
82
# Folder names skipped by default for folder hashing scans.
# Each name is turned into a '<name>/' skip pattern unless all folders are requested.
DEFAULT_SKIPPED_DIRS_HFH = {
    # nb* folders (presumably NetBeans project/build output)
    'nbproject',
    'nbbuild',
    'nbdist',
    # Python-related folders
    '__pycache__',
    '__pypackages__',
    'venv',
    'eggs',
    'wheels',
    'htmlcov',
    # Generated documentation (presumably YARD)
    '_yardoc',
    # Sample code
    'example',
    'examples',
}
96
+
69
97
70
98
# Folder name endings to skip
DEFAULT_SKIPPED_DIR_EXT = {
    '.egg-info',
}
# Folder name endings to skip for the folder hashing variant
DEFAULT_SKIPPED_DIR_EXT_HFH = {
    '.egg-info',
}
101
+
72
102
# File extensions to skip
73
103
DEFAULT_SKIPPED_EXT = {
74
104
'.1' ,
243
273
'sqlite3' ,
244
274
}
245
275
276
# File name endings skipped by default when the folder hashing (HFH) variant is selected.
# Every entry is tested with str.endswith() against the lowercased file name, so entries
# must be lowercase and an entry without a leading dot matches any name ending in that text.
# TODO: For folder hashing (HFH), also add the .gitignore patterns
DEFAULT_SKIPPED_EXT_HFH = {
    '.1',
    '.2',
    '.3',
    '.4',
    '.5',
    '.6',
    '.7',
    '.8',
    '.9',
    '.ac',
    '.adoc',
    '.am',
    '.asciidoc',
    '.bmp',
    '.build',
    '.cfg',
    '.chm',
    '.class',
    '.cmake',
    '.cnf',
    '.conf',
    '.config',
    '.contributors',
    '.copying',
    '.crt',
    '.csproj',
    '.css',
    '.csv',
    '.dat',
    '.data',
    '.dtd',
    '.dts',
    '.iws',
    '.c9',
    '.c9revisions',
    '.dtsi',
    '.dump',
    '.eot',
    '.eps',
    '.geojson',
    '.gif',
    '.glif',
    '.gmo',
    '.guess',
    '.hex',
    '.htm',
    '.html',
    '.ico',
    '.iml',
    '.in',
    '.inc',
    '.info',
    '.ini',
    '.ipynb',
    '.jpeg',
    '.jpg',
    '.json',
    '.jsonld',
    '.lock',
    '.log',
    '.m4',
    '.map',
    '.md5',
    '.meta',
    '.mk',
    '.mxml',
    '.o',
    '.otf',
    '.out',
    '.pbtxt',
    '.pdf',
    '.pem',
    '.phtml',
    '.plist',
    '.png',
    '.prefs',
    '.properties',
    '.pyc',
    '.qdoc',
    '.result',
    '.rgb',
    '.rst',
    '.scss',
    '.sha',
    '.sha1',
    '.sha2',
    '.sha256',
    '.sln',
    '.spec',
    '.sub',
    '.svg',
    '.svn-base',
    '.tab',
    '.template',
    '.test',
    '.tex',
    '.tiff',
    '.ttf',
    '.txt',
    '.utf-8',
    '.vim',
    '.wav',
    '.woff',
    '.woff2',
    '.xht',
    '.xhtml',
    '.xml',
    '.xpm',
    '.xsd',
    '.xul',
    '.yaml',
    '.yml',
    '.wfp',
    '.editorconfig',
    '.dotcover',
    '.pid',
    '.lcov',
    '.egg',
    '.manifest',
    '.cache',
    '.coverage',
    '.cover',
    '.gem',
    '.lst',
    '.pickle',
    '.pdb',
    '.gml',
    '.pot',
    '.plt',
    '.whml',
    '.pom',
    '.smtml',
    '.min.js',
    '.mf',
    '.base64',
    '.s',
    '.diff',
    '.patch',
    '.rules',
    # File endings (no leading dot: matches any file name that ends with this text)
    '-doc',
    'config',
    'news',
    'readme',
    'swiftdoc',
    'texidoc',
    'todo',
    'version',
    'ignore',
    'manifest',
    'sqlite',
    'sqlite3',
}
431
+
246
432
247
433
class FileFilters (ScanossBase ):
248
434
"""
@@ -267,6 +453,7 @@ def __init__(self, debug: bool = False, trace: bool = False, quiet: bool = False
267
453
skip_size (int): Size to skip
268
454
skip_extensions (list): Extensions to skip
269
455
skip_folders (list): Folders to skip
456
+ is_folder_hashing_scan (bool): Whether the operation is a folder hashing scan
270
457
"""
271
458
super ().__init__ (debug , trace , quiet )
272
459
@@ -277,6 +464,7 @@ def __init__(self, debug: bool = False, trace: bool = False, quiet: bool = False
277
464
self .skip_folders = kwargs .get ('skip_folders' , [])
278
465
self .skip_size = kwargs .get ('skip_size' , 0 )
279
466
self .skip_extensions = kwargs .get ('skip_extensions' , [])
467
+ self .is_folder_hashing_scan = kwargs .get ('is_folder_hashing_scan' , False )
280
468
self .file_folder_pat_spec = self ._get_file_folder_pattern_spec (kwargs .get ('operation_type' , 'scanning' ))
281
469
self .size_pat_rules = self ._get_size_limit_pattern_rules (kwargs .get ('operation_type' , 'scanning' ))
282
470
@@ -336,36 +524,36 @@ def get_filtered_files_from_files(self, files: List[str], scan_root: str = None)
336
524
"""
337
525
filtered_files = []
338
526
for file_path in files :
339
- if not os .path .exists (file_path ) or not os .path .isfile (file_path ) or os .path .islink (file_path ):
340
- self .print_debug (
341
- f'WARNING: File { file_path } does not exist, is not a file, or is a symbolic link. Ignoring.'
342
- )
343
- continue
344
-
345
527
path_obj = Path (file_path )
346
- if not self .hidden_files_folders and any (part .startswith ('.' ) for part in path_obj .parts ):
347
- self .print_debug (f'Skipping file: { file_path } (in hidden directory or is hidden file)' )
348
- continue
349
-
350
528
try :
351
529
if scan_root :
352
- rel_path = os . path . relpath ( file_path , scan_root )
530
+ rel_path = path_obj . relative_to ( scan_root )
353
531
else :
354
- rel_path = os . path . relpath ( file_path )
532
+ rel_path = str ( path_obj )
355
533
except ValueError :
356
- # If file_path is broken, symlink ignore it
357
534
self .print_debug (f'Ignoring file: { file_path } (broken symlink)' )
358
535
continue
536
+
537
+ if not path_obj .exists () or not path_obj .is_file () or path_obj .is_symlink ():
538
+ self .print_debug (
539
+ f'WARNING: File { rel_path } does not exist, is not a file, or is a symbolic link. Ignoring.'
540
+ )
541
+ continue
542
+
543
+ if not self .hidden_files_folders and any (part .startswith ('.' ) for part in path_obj .parts ):
544
+ self .print_debug (f'Skipping file: { rel_path } (in hidden directory or is hidden file)' )
545
+ continue
546
+
359
547
if self ._should_skip_file (rel_path ):
360
548
continue
361
549
try :
362
- file_size = os . path . getsize ( file_path )
550
+ file_size = path_obj . stat (). st_size
363
551
if file_size == 0 :
364
552
self .print_debug (f'Skipping file: { rel_path } (empty file)' )
365
553
continue
366
554
min_size , max_size = self ._get_operation_size_limits (file_path )
367
555
if min_size <= file_size <= max_size :
368
- filtered_files .append (rel_path )
556
+ filtered_files .append (str ( rel_path ) )
369
557
else :
370
558
self .print_debug (
371
559
f'Skipping file: { rel_path } (size { file_size } outside limits { min_size } -{ max_size } )'
@@ -379,8 +567,11 @@ def _get_file_folder_pattern_spec(self, operation_type: str = 'scanning'):
379
567
"""
380
568
Get file path pattern specification.
381
569
382
- :param operation_type: which operation is being performed
383
- :return: List of file path patterns
570
+ Args:
571
+ operation_type (str): Type of operation ('scanning' or 'fingerprinting')
572
+
573
+ Returns:
574
+ GitIgnoreSpec: GitIgnoreSpec object containing the file path patterns
384
575
"""
385
576
patterns = self ._get_operation_patterns (operation_type )
386
577
if patterns :
@@ -391,8 +582,11 @@ def _get_size_limit_pattern_rules(self, operation_type: str = 'scanning'):
391
582
"""
392
583
Get size limit pattern rules.
393
584
394
- :param operation_type: which operation is being performed
395
- :return: List of size limit pattern rules
585
+ Args:
586
+ operation_type (str): Type of operation ('scanning' or 'fingerprinting')
587
+
588
+ Returns:
589
+ List of size limit pattern rules
396
590
"""
397
591
if self .scanoss_settings :
398
592
size_rules = self .scanoss_settings .get_skip_sizes (operation_type )
@@ -417,6 +611,14 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]:
417
611
List[str]: Combined list of patterns to skip
418
612
"""
419
613
patterns = []
614
+
615
+ # Default patterns for skipping directories
616
+ if not self .all_folders :
617
+ DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self .is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
618
+ for dir_name in DEFAULT_SKIPPED_DIR_LIST :
619
+ patterns .append (f'{ dir_name } /' )
620
+
621
+ # Custom patterns added in SCANOSS settings file
420
622
if self .scanoss_settings :
421
623
patterns .extend (self .scanoss_settings .get_skip_patterns (operation_type ))
422
624
return patterns
@@ -505,18 +707,21 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
505
707
"""
506
708
file_name = os .path .basename (file_rel_path )
507
709
710
+ DEFAULT_SKIPPED_FILES_LIST = DEFAULT_SKIPPED_FILES_HFH if self .is_folder_hashing_scan else DEFAULT_SKIPPED_FILES
711
+ DEFAULT_SKIPPED_EXT_LIST = DEFAULT_SKIPPED_EXT_HFH if self .is_folder_hashing_scan else DEFAULT_SKIPPED_EXT
712
+
508
713
if not self .hidden_files_folders and file_name .startswith ('.' ):
509
714
self .print_debug (f'Skipping file: { file_rel_path } (hidden file)' )
510
715
return True
511
716
if self .all_extensions :
512
717
return False
513
718
file_name_lower = file_name .lower ()
514
719
# Look for exact files
515
- if file_name_lower in DEFAULT_SKIPPED_FILES :
720
+ if file_name_lower in DEFAULT_SKIPPED_FILES_LIST :
516
721
self .print_debug (f'Skipping file: { file_rel_path } (matches default skip file)' )
517
722
return True
518
723
# Look for file endings
519
- for ending in DEFAULT_SKIPPED_EXT :
724
+ for ending in DEFAULT_SKIPPED_EXT_LIST :
520
725
if file_name_lower .endswith (ending ):
521
726
self .print_debug (f'Skipping file: { file_rel_path } (matches default skip ending: { ending } )' )
522
727
return True
@@ -531,39 +736,3 @@ def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
531
736
self .print_debug (f'Skipping file: { file_rel_path } (matches custom pattern)' )
532
737
return True
533
738
return False
534
-
535
- def _should_skip_file_for_hfh (self , file_path : Path ) -> bool :
536
- """
537
- Check if a file should be skipped during folder hashing scan.
538
-
539
- Args:
540
- file_path (Path): The path to the file to check.
541
-
542
- Returns:
543
- bool: True if the file should be skipped, False otherwise.
544
- """
545
- try :
546
- if (
547
- any (part .startswith ('.' ) for part in file_path .parts ) # Hidden files/folders
548
- or file_path .is_symlink () # Symlinks
549
- or file_path .stat ().st_size == 0 # Empty files
550
- ):
551
- self .print_debug (f'Skipping file: { file_path } (hidden/symlink/empty)' )
552
- return True
553
-
554
- # Files ending with null
555
- if file_path .suffix .lower () == '.txt' :
556
- try :
557
- with open (file_path , 'rb' ) as f :
558
- if f .read ().endswith (b'\x00 ' ):
559
- self .print_debug (f'Skipping file: { file_path } (text file ending with null)' )
560
- return True
561
- except (OSError , IOError ):
562
- self .print_debug (f'Skipping file: { file_path } (cannot read file content)' )
563
- return True
564
-
565
- return False
566
-
567
- except Exception as e :
568
- self .print_debug (f'Error checking file { file_path } : { str (e )} ' )
569
- return True
0 commit comments