Skip to content

Commit a26dae1

Browse files
committed
[SP-2993] feat: add min_accepted_score and modify logic to not report duplicated components
1 parent 7a6da2b commit a26dae1

File tree

3 files changed

+102
-14
lines changed

3 files changed

+102
-14
lines changed

internal/domain/entities/scan_request.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ type ScanRequest struct {
66
RankThreshold int `validate:"omitempty,min=0"`
77
// Recursive threshold (e.g., only show results with a score above this threshold)
88
RecursiveThreshold float32 `validate:"omitempty,min=0"`
9+
// Minimum accepted score: only matches with a score strictly greater than this value are reported (default: 0.15)
10+
MinAcceptedScore float32 `validate:"omitempty,min=0"`
911
// Filter results by category (e.g., only show results from GitHub projects, npm, etc.)
1012
Category string
1113
// Maximum number of results to query

internal/mapper/scan_mapper_impl.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ func (m *ScanMapperImpl) ProtoToDomain(req *scanningv2.HFHRequest) *entities.Sca
2222
return &entities.ScanRequest{
2323
RankThreshold: int(req.RankThreshold),
2424
RecursiveThreshold: req.RecursiveThreshold,
25+
MinAcceptedScore: req.MinAcceptedScore,
2526
Category: req.Category,
2627
QueryLimit: int(req.QueryLimit),
2728
Root: m.ChildrenToDomain(req.Root),

internal/service/scan_service_impl.go

Lines changed: 99 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,14 @@ func (s *ScanServiceImpl) ScanFolder(ctx context.Context, req *entities.ScanRequ
2727
return nil, err
2828
}
2929

30-
results, err := s.scanNode(ctx, req.Root, req.RankThreshold, req.RecursiveThreshold, true)
30+
results, err := s.scanNode(ctx, req.Root, req.RankThreshold, req.RecursiveThreshold, req.MinAcceptedScore, true)
3131
if err != nil {
3232
return nil, err
3333
}
3434

35+
// Deduplicate components across folders, keeping only the highest scoring instance
36+
results = s.deduplicateComponents(results)
37+
3538
response := &entities.ScanResponse{
3639
Results: results,
3740
}
@@ -41,29 +44,45 @@ func (s *ScanServiceImpl) ScanFolder(ctx context.Context, req *entities.ScanRequ
4144
return response, nil
4245
}
4346

44-
func (s *ScanServiceImpl) processComponentGroups(componentGroups []entities.ComponentGroup, path string) []*entities.ScanResult {
47+
func (s *ScanServiceImpl) processComponentGroups(componentGroups []entities.ComponentGroup, path string, minAcceptedScore float32) []*entities.ScanResult {
4548
if len(componentGroups) == 0 {
4649
return []*entities.ScanResult{}
4750
}
4851

4952
var results []*entities.ScanResult
53+
var filteredGroups []*entities.ComponentGroup
5054

51-
result := &entities.ScanResult{
52-
PathID: path,
53-
ComponentGroups: make([]*entities.ComponentGroup, len(componentGroups)),
54-
}
55+
// Filter component groups based on minimum accepted score
56+
for _, group := range componentGroups {
57+
// Filter versions within the group
58+
var filteredVersions []entities.Version
59+
for _, version := range group.Versions {
60+
if version.Score > minAcceptedScore {
61+
filteredVersions = append(filteredVersions, version)
62+
}
63+
}
5564

56-
for i, group := range componentGroups {
57-
groupCopy := group
58-
result.ComponentGroups[i] = &groupCopy
65+
// Only include the group if it has at least one version above the threshold
66+
if len(filteredVersions) > 0 {
67+
groupCopy := group
68+
groupCopy.Versions = filteredVersions
69+
filteredGroups = append(filteredGroups, &groupCopy)
70+
}
5971
}
6072

61-
results = append(results, result)
73+
// Only create a result if we have filtered groups
74+
if len(filteredGroups) > 0 {
75+
result := &entities.ScanResult{
76+
PathID: path,
77+
ComponentGroups: filteredGroups,
78+
}
79+
results = append(results, result)
80+
}
6281

6382
return results
6483
}
6584

66-
func (s *ScanServiceImpl) scanNode(ctx context.Context, node *entities.FolderNode, rankThreshold int, recursiveThreshold float32, isRoot bool) ([]*entities.ScanResult, error) {
85+
func (s *ScanServiceImpl) scanNode(ctx context.Context, node *entities.FolderNode, rankThreshold int, recursiveThreshold float32, minAcceptedScore float32, isRoot bool) ([]*entities.ScanResult, error) {
6786
logger := ctxzap.Extract(ctx).Sugar()
6887

6988
if node.SimHashDirNames == "" && node.SimHashNames == "" && node.SimHashContent == "" {
@@ -86,15 +105,15 @@ func (s *ScanServiceImpl) scanNode(ctx context.Context, node *entities.FolderNod
86105
// Check if any component group has a version with score >= recursiveThreshold
87106
if shouldCheckThreshold && recursiveThreshold > 0 && s.hasHighScoreMatch(componentGroups, recursiveThreshold) {
88107
logger.Infof("Found high score match (>= %f) for node %s, stopping search", recursiveThreshold, node.PathID)
89-
results := s.processComponentGroups(componentGroups, node.PathID)
108+
results := s.processComponentGroups(componentGroups, node.PathID, minAcceptedScore)
90109
return results, nil
91110
}
92111

93-
results := s.processComponentGroups(componentGroups, node.PathID)
112+
results := s.processComponentGroups(componentGroups, node.PathID, minAcceptedScore)
94113

95114
if len(node.Children) > 0 {
96115
for _, child := range node.Children {
97-
childResults, err := s.scanNode(ctx, child, rankThreshold, recursiveThreshold, false)
116+
childResults, err := s.scanNode(ctx, child, rankThreshold, recursiveThreshold, minAcceptedScore, false)
98117
if err != nil {
99118
return nil, err
100119
}
@@ -116,3 +135,69 @@ func (s *ScanServiceImpl) hasHighScoreMatch(componentGroups []entities.Component
116135
}
117136
return false
118137
}
138+
139+
// deduplicateComponents removes duplicate components across folders, keeping only the highest scoring instance
140+
func (s *ScanServiceImpl) deduplicateComponents(results []*entities.ScanResult) []*entities.ScanResult {
141+
if len(results) == 0 {
142+
return results
143+
}
144+
145+
// Map to track the best component instance: PURL -> (pathID, componentGroup, maxScore)
146+
type componentInfo struct {
147+
pathID string
148+
component *entities.ComponentGroup
149+
maxScore float32
150+
}
151+
bestComponents := make(map[string]*componentInfo)
152+
153+
// Find the highest scoring instance of each component
154+
for _, result := range results {
155+
for _, group := range result.ComponentGroups {
156+
// Find the maximum score for this component group
157+
var maxScore float32
158+
for _, version := range group.Versions {
159+
if version.Score > maxScore {
160+
maxScore = version.Score
161+
}
162+
}
163+
164+
// Check if we've seen this component before
165+
if existing, exists := bestComponents[group.PURL]; exists {
166+
// Keep the one with higher score
167+
if maxScore > existing.maxScore {
168+
bestComponents[group.PURL] = &componentInfo{
169+
pathID: result.PathID,
170+
component: group,
171+
maxScore: maxScore,
172+
}
173+
}
174+
} else {
175+
// First time seeing this component
176+
bestComponents[group.PURL] = &componentInfo{
177+
pathID: result.PathID,
178+
component: group,
179+
maxScore: maxScore,
180+
}
181+
}
182+
}
183+
}
184+
185+
// Rebuild results with deduplicated components
186+
pathToComponents := make(map[string][]*entities.ComponentGroup)
187+
for _, info := range bestComponents {
188+
pathToComponents[info.pathID] = append(pathToComponents[info.pathID], info.component)
189+
}
190+
191+
// Create new result set
192+
var deduplicatedResults []*entities.ScanResult
193+
for pathID, components := range pathToComponents {
194+
if len(components) > 0 {
195+
deduplicatedResults = append(deduplicatedResults, &entities.ScanResult{
196+
PathID: pathID,
197+
ComponentGroups: components,
198+
})
199+
}
200+
}
201+
202+
return deduplicatedResults
203+
}

0 commit comments

Comments
 (0)