Skip to content

Commit 8254685

Browse files
committed
[SP-2991] feat: add depth and min-cutoff-threshold arguments to folder hashing commands
1 parent e1320cc commit 8254685

File tree

5 files changed

+70
-11
lines changed

5 files changed

+70
-11
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99
### Added
1010
- Upcoming changes...
1111

12+
## [1.31.0] - 2025-08-05
13+
### Added
14+
- Add `--min-cutoff-threshold` argument to `folder-scan` command
15+
- Add `--depth` argument to `folder-scan` and `folder-hash` commands
16+
1217
## [1.31.4] - 2025-08-20
1318
### Added
1419
- Added support for empty dependency track project policy checks

src/scanoss/cli.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@
5757
from .components import Components
5858
from .constants import (
5959
DEFAULT_API_TIMEOUT,
60+
DEFAULT_HFH_DEPTH,
61+
DEFAULT_HFH_MIN_CUTOFF_THRESHOLD,
6062
DEFAULT_HFH_RANK_THRESHOLD,
6163
DEFAULT_POST_SIZE,
6264
DEFAULT_RETRY,
@@ -887,6 +889,18 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
887889
help='Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 '
888890
'returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches.',
889891
)
892+
p_folder_scan.add_argument(
893+
'--depth',
894+
type=int,
895+
default=DEFAULT_HFH_DEPTH,
896+
help=f'Defines how deep to scan the root directory (optional - default {DEFAULT_HFH_DEPTH})',
897+
)
898+
p_folder_scan.add_argument(
899+
'--min-cutoff-threshold',
900+
type=float,
901+
default=DEFAULT_HFH_MIN_CUTOFF_THRESHOLD,
902+
help=f'Minimum score threshold to consider a match (optional - default: {DEFAULT_HFH_MIN_CUTOFF_THRESHOLD})',
903+
)
890904
p_folder_scan.set_defaults(func=folder_hashing_scan)
891905

892906
# Sub-command: folder-hash
@@ -905,6 +919,12 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
905919
default='json',
906920
help='Result output format (optional - default: json)',
907921
)
922+
p_folder_hash.add_argument(
923+
'--depth',
924+
type=int,
925+
default=DEFAULT_HFH_DEPTH,
926+
help=f'Defines how deep to hash the root directory (optional - default {DEFAULT_HFH_DEPTH})',
927+
)
908928
p_folder_hash.set_defaults(func=folder_hash)
909929

910930
# Output options
@@ -2397,6 +2417,8 @@ def folder_hashing_scan(parser, args):
23972417
client=client,
23982418
scanoss_settings=scanoss_settings,
23992419
rank_threshold=args.rank_threshold,
2420+
depth=args.depth,
2421+
min_cutoff_threshold=args.min_cutoff_threshold,
24002422
)
24012423

24022424
if scanner.scan():
@@ -2430,6 +2452,7 @@ def folder_hash(parser, args):
24302452
scan_dir=args.scan_dir,
24312453
config=folder_hasher_config,
24322454
scanoss_settings=scanoss_settings,
2455+
depth=args.depth,
24332456
)
24342457

24352458
folder_hasher.hash_directory(args.scan_dir)

src/scanoss/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,6 @@
1313

1414
DEFAULT_API_TIMEOUT = 600
1515

16-
DEFAULT_HFH_RANK_THRESHOLD = 5
16+
DEFAULT_HFH_RANK_THRESHOLD = 5
17+
DEFAULT_HFH_DEPTH = 1
18+
DEFAULT_HFH_MIN_CUTOFF_THRESHOLD = 0.5

src/scanoss/scanners/folder_hasher.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from progress.bar import Bar
88

9+
from scanoss.constants import DEFAULT_HFH_DEPTH
910
from scanoss.file_filters import FileFilters
1011
from scanoss.scanoss_settings import ScanossSettings
1112
from scanoss.scanossbase import ScanossBase
@@ -72,13 +73,20 @@ class FolderHasher:
7273
7374
It builds a directory tree (DirectoryNode) and computes the associated
7475
hash data for the folder.
76+
77+
Args:
78+
scan_dir (str): The directory to be hashed.
79+
config (FolderHasherConfig): Configuration parameters for the folder hasher.
80+
scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
81+
depth (int): How many levels to hash from the root directory (default: 1).
7582
"""
7683

7784
def __init__(
7885
self,
7986
scan_dir: str,
8087
config: FolderHasherConfig,
8188
scanoss_settings: Optional[ScanossSettings] = None,
89+
depth: int = DEFAULT_HFH_DEPTH,
8290
):
8391
self.base = ScanossBase(
8492
debug=config.debug,
@@ -101,6 +109,7 @@ def __init__(
101109

102110
self.scan_dir = scan_dir
103111
self.tree = None
112+
self.depth = depth
104113

105114
def hash_directory(self, path: str) -> dict:
106115
"""
@@ -123,7 +132,10 @@ def hash_directory(self, path: str) -> dict:
123132

124133
return tree
125134

126-
def _build_root_node(self, path: str) -> DirectoryNode:
135+
def _build_root_node(
136+
self,
137+
path: str,
138+
) -> DirectoryNode:
127139
"""
128140
Build a directory tree from the given path with file information.
129141
@@ -180,7 +192,7 @@ def _build_root_node(self, path: str) -> DirectoryNode:
180192
bar.finish()
181193
return root_node
182194

183-
def _hash_calc_from_node(self, node: DirectoryNode) -> dict:
195+
def _hash_calc_from_node(self, node: DirectoryNode, current_depth: int = 1) -> dict:
184196
"""
185197
Recursively compute folder hash data for a directory node.
186198
@@ -189,12 +201,13 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict:
189201
190202
Args:
191203
node (DirectoryNode): The directory node to compute the hash for.
204+
current_depth (int): The current depth level (1-based, root is depth 1).
192205
193206
Returns:
194207
dict: The computed hash data for the node.
195208
"""
196209
hash_data = self._hash_calc(node)
197-
210+
198211
# Safely calculate relative path
199212
try:
200213
node_path = Path(node.path).resolve()
@@ -204,13 +217,18 @@ def _hash_calc_from_node(self, node: DirectoryNode) -> dict:
204217
# If relative_to fails, use the node path as is or a fallback
205218
rel_path = Path(node.path).name if node.path else Path('.')
206219

220+
# Only process children if we haven't reached the depth limit
221+
children = []
222+
if current_depth < self.depth:
223+
children = [self._hash_calc_from_node(child, current_depth + 1) for child in node.children.values()]
224+
207225
return {
208226
'path_id': str(rel_path),
209227
'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None,
210228
'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None,
211229
'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None,
212230
'lang_extensions': hash_data['lang_extensions'],
213-
'children': [self._hash_calc_from_node(child) for child in node.children.values()],
231+
'children': children,
214232
}
215233

216234
def _hash_calc(self, node: DirectoryNode) -> dict:

src/scanoss/scanners/scanner_hfh.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,11 @@
2929

3030
from progress.spinner import Spinner
3131

32-
from scanoss.constants import DEFAULT_HFH_RANK_THRESHOLD
32+
from scanoss.constants import (
33+
DEFAULT_HFH_DEPTH,
34+
DEFAULT_HFH_MIN_CUTOFF_THRESHOLD,
35+
DEFAULT_HFH_RANK_THRESHOLD,
36+
)
3337
from scanoss.cyclonedx import CycloneDx
3438
from scanoss.file_filters import FileFilters
3539
from scanoss.scanners.folder_hasher import FolderHasher
@@ -48,13 +52,15 @@ class ScannerHFH:
4852
and calculates simhash values based on file names and content to detect folder-level similarities.
4953
"""
5054

51-
def __init__(
55+
def __init__( # noqa: PLR0913
5256
self,
5357
scan_dir: str,
5458
config: ScannerConfig,
5559
client: Optional[ScanossGrpc] = None,
5660
scanoss_settings: Optional[ScanossSettings] = None,
5761
rank_threshold: int = DEFAULT_HFH_RANK_THRESHOLD,
62+
depth: int = DEFAULT_HFH_DEPTH,
63+
min_cutoff_threshold: float = DEFAULT_HFH_MIN_CUTOFF_THRESHOLD,
5864
):
5965
"""
6066
Initialize the ScannerHFH.
@@ -65,6 +71,8 @@ def __init__(
6571
client (ScanossGrpc): gRPC client for communicating with the scanning service.
6672
scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
6773
rank_threshold (int): Get results with rank at or below this threshold (default: 5).
74+
depth (int): How many levels to scan (default: 1).
75+
min_cutoff_threshold (float): Minimum score threshold to consider a match (default: 0.5).
6876
"""
6977
self.base = ScanossBase(
7078
debug=config.debug,
@@ -87,12 +95,14 @@ def __init__(
8795
scan_dir=scan_dir,
8896
config=config,
8997
scanoss_settings=scanoss_settings,
98+
depth=depth,
9099
)
91100

92101
self.scan_dir = scan_dir
93102
self.client = client
94103
self.scan_results = None
95104
self.rank_threshold = rank_threshold
105+
self.min_cutoff_threshold = min_cutoff_threshold
96106

97107
def scan(self) -> Optional[Dict]:
98108
"""
@@ -102,8 +112,9 @@ def scan(self) -> Optional[Dict]:
102112
Optional[Dict]: The folder hash response from the gRPC client, or None if an error occurs.
103113
"""
104114
hfh_request = {
105-
'root': self.folder_hasher.hash_directory(self.scan_dir),
115+
'root': self.folder_hasher.hash_directory(path=self.scan_dir),
106116
'rank_threshold': self.rank_threshold,
117+
'min_cutoff_threshold': self.min_cutoff_threshold,
107118
}
108119

109120
spinner = Spinner('Scanning folder...')
@@ -193,7 +204,7 @@ def _format_cyclonedx_output(self) -> str: # noqa: PLR0911
193204
}
194205
]
195206
}
196-
207+
197208
get_vulnerabilities_json_request = {
198209
'purls': [{'purl': purl, 'requirement': best_match_version['version']}],
199210
}
@@ -210,10 +221,10 @@ def _format_cyclonedx_output(self) -> str: # noqa: PLR0911
210221
error_msg = 'ERROR: Failed to produce CycloneDX output'
211222
self.base.print_stderr(error_msg)
212223
return None
213-
224+
214225
if vulnerabilities:
215226
cdx_output = cdx.append_vulnerabilities(cdx_output, vulnerabilities, purl)
216-
227+
217228
return json.dumps(cdx_output, indent=2)
218229
except Exception as e:
219230
self.base.print_stderr(f'ERROR: Failed to get license information: {e}')

0 commit comments

Comments
 (0)