Skip to content

Commit c29efef

Browse files
committed
refactor(scanoss): ScanOssResultParser to improve snippets findings
* Generate one Snippet for each detected line range
* Remove duplicate licenses to optimize results
* Remove identified snippets from the summary

Signed-off-by: Agustin Isasmendi <[email protected]>
1 parent f8b8fda commit c29efef

File tree

4 files changed

+196
-28
lines changed

4 files changed

+196
-28
lines changed

plugins/scanners/scanoss/src/main/kotlin/ScanOssResultParser.kt

Lines changed: 53 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,17 @@
1818
*/
1919

2020
package org.ossreviewtoolkit.plugins.scanners.scanoss
21-
21+
import com.scanoss.dto.LicenseDetails
2222
import com.scanoss.dto.ScanFileDetails
2323
import com.scanoss.dto.ScanFileResult
2424
import com.scanoss.dto.enums.MatchType
25+
import com.scanoss.dto.enums.StatusType
2526

27+
import java.lang.invoke.MethodHandles
2628
import java.time.Instant
2729

30+
import org.apache.logging.log4j.kotlin.loggerOf
31+
2832
import org.ossreviewtoolkit.downloader.VcsHost
2933
import org.ossreviewtoolkit.model.CopyrightFinding
3034
import org.ossreviewtoolkit.model.LicenseFinding
@@ -36,7 +40,8 @@ import org.ossreviewtoolkit.model.TextLocation
3640
import org.ossreviewtoolkit.utils.spdx.SpdxConstants
3741
import org.ossreviewtoolkit.utils.spdx.SpdxExpression
3842
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseIdExpression
39-
import org.ossreviewtoolkit.utils.spdx.toExpression
43+
44+
private val logger = loggerOf(MethodHandles.lookup().lookupClass())
4045

4146
/**
4247
* Generate a summary from the given SCANOSS [result], using [startTime], [endTime] as metadata. This variant can be
@@ -56,16 +61,29 @@ internal fun generateSummary(startTime: Instant, endTime: Instant, results: List
5661
}
5762

5863
MatchType.snippet -> {
59-
val file = requireNotNull(details.file)
60-
val lines = requireNotNull(details.lines)
61-
val sourceLocations = convertLines(file, lines)
62-
val snippets = getSnippets(details)
63-
64-
snippets.forEach { snippet ->
65-
sourceLocations.forEach { sourceLocation ->
66-
// TODO: Aggregate the snippet by source file location.
67-
snippetFindings += SnippetFinding(sourceLocation, setOf(snippet))
64+
val file = requireNotNull(result.filePath)
65+
if (details.status == StatusType.pending) {
66+
val lines = requireNotNull(details.lines)
67+
val sourceLocations = convertLines(file, lines)
68+
val snippets = getSnippets(details)
69+
70+
// The number of snippets should match the number of source locations.
71+
if (sourceLocations.size != snippets.size) {
72+
logger.warn {
73+
"Unexpected mismatch in '$file': " +
74+
"${sourceLocations.size} source locations vs ${snippets.size} snippets. " +
75+
"This indicates a potential issue with line range conversion."
76+
}
6877
}
78+
79+
// Associate each source location with its corresponding snippet.
80+
sourceLocations.zip(snippets).forEach { (location, snippet) ->
81+
snippetFindings += SnippetFinding(location, setOf(snippet))
82+
}
83+
} else {
84+
logger.warn { "File '$file' is identified, not including on snippet findings" }
85+
licenseFindings += getLicenseFindings(details)
86+
copyrightFindings += getCopyrightFindings(details)
6987
}
7088
}
7189

@@ -134,36 +152,34 @@ private fun getCopyrightFindings(details: ScanFileDetails): List<CopyrightFindin
134152
}
135153

136154
/**
137-
* Get the snippet findings from the given [details]. If a snippet returned by ScanOSS contains several Purls,
138-
* several snippets are created in ORT each containing a single Purl.
155+
* Get the snippets from the given [details]. If a snippet returned by SCANOSS contains several PURLs,
156+
* the function uses the first PURL as the primary identifier while storing all PURLs in additionalData
157+
* to preserve the complete information.
139158
*/
140-
private fun getSnippets(details: ScanFileDetails): Set<Snippet> {
159+
private fun getSnippets(details: ScanFileDetails): List<Snippet> {
141160
val matched = requireNotNull(details.matched)
142161
val fileUrl = requireNotNull(details.fileUrl)
143162
val ossLines = requireNotNull(details.ossLines)
144163
val url = requireNotNull(details.url)
145164
val purls = requireNotNull(details.purls)
146165

147-
val licenses = details.licenseDetails.orEmpty().mapTo(mutableSetOf()) { license ->
148-
SpdxExpression.parse(license.name)
149-
}
166+
val license = getUniqueLicenseExpression(details.licenseDetails.toList())
150167

151168
val score = matched.substringBeforeLast("%").toFloat()
152169
val locations = convertLines(fileUrl, ossLines)
153170
// TODO: No resolved revision is available. Should a ArtifactProvenance be created instead ?
154171
val vcsInfo = VcsHost.parseUrl(url.takeUnless { it == "none" }.orEmpty())
155172
val provenance = RepositoryProvenance(vcsInfo, ".")
156173

157-
val additionalData = mapOf("release_date" to details.releaseDate)
158-
159-
return buildSet {
160-
purls.forEach { purl ->
161-
locations.forEach { snippetLocation ->
162-
val license = licenses.toExpression()?.sorted() ?: SpdxLicenseIdExpression(SpdxConstants.NOASSERTION)
174+
// Store all PURLs in additionalData to preserve the complete information.
175+
val additionalData = mapOf(
176+
"release_date" to details.releaseDate,
177+
"all_purls" to purls.joinToString(" ")
178+
)
163179

164-
add(Snippet(score, snippetLocation, provenance, purl, license, additionalData))
165-
}
166-
}
180+
// Create one snippet per location, using the first PURL as the primary identifier.
181+
return locations.map { snippetLocation ->
182+
Snippet(score, snippetLocation, provenance, purls.firstOrNull().orEmpty(), license, additionalData)
167183
}
168184
}
169185

@@ -180,3 +196,14 @@ private fun convertLines(file: String, lineRanges: String): List<TextLocation> =
180196
else -> throw IllegalArgumentException("Unsupported line range '$lineRange'.")
181197
}
182198
}
199+
200+
fun getUniqueLicenseExpression(licensesDetails: List<LicenseDetails>): SpdxExpression {
201+
if (licensesDetails.isEmpty()) {
202+
return SpdxLicenseIdExpression(SpdxConstants.NOASSERTION)
203+
}
204+
205+
return licensesDetails
206+
.map { license -> SpdxExpression.parse(license.name) }
207+
.reduce { acc, expr -> acc and expr }
208+
.simplify()
209+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
{
2+
"hung_task.c": [
3+
{
4+
"component": "proton_bluecross",
5+
"file": "kernel/hung_task.c",
6+
"file_hash": "581734935cfbe570d280a1265aaa2a6b",
7+
"file_url": "https://api.scanoss.com/file_contents/581734935cfbe570d280a1265aaa2a6b",
8+
"id": "snippet",
9+
"latest": "17",
10+
"licenses": [
11+
{
12+
"checklist_url": "https://www.osadl.org/fileadmin/checklists/unreflicenses/GPL-2.0-only.txt",
13+
"copyleft": "yes",
14+
"incompatible_with": "Apache-1.0, Apache-1.1, Apache-2.0, BSD-4-Clause, BSD-4-Clause-UC, BSD-4.3TAHOE, ECL-2.0, FTL, IJG, LicenseRef-scancode-bsla-no-advert, Minpack, OpenSSL, PHP-3.01, Python-2.0, zlib-acknowledgement, XFree86-1.1",
15+
"name": "GPL-2.0-only",
16+
"osadl_updated": "2025-02-10T14:26:00+0000",
17+
"patent_hints": "yes",
18+
"source": "scancode",
19+
"url": "https://spdx.org/licenses/GPL-2.0-only.html"
20+
},
21+
{
22+
"name": "GPL-2.0-only WITH Linux-syscall-note",
23+
"source": "scancode",
24+
"url": "https://spdx.org/licenses/GPL-2.0-only WITH Linux-syscall-note.html"
25+
},
26+
{
27+
"checklist_url": "https://www.osadl.org/fileadmin/checklists/unreflicenses/GPL-2.0-only.txt",
28+
"copyleft": "yes",
29+
"incompatible_with": "Apache-1.0, Apache-1.1, Apache-2.0, BSD-4-Clause, BSD-4-Clause-UC, BSD-4.3TAHOE, ECL-2.0, FTL, IJG, LicenseRef-scancode-bsla-no-advert, Minpack, OpenSSL, PHP-3.01, Python-2.0, zlib-acknowledgement, XFree86-1.1",
30+
"name": "GPL-2.0-only",
31+
"osadl_updated": "2025-02-10T14:26:00+0000",
32+
"patent_hints": "yes",
33+
"source": "scancode",
34+
"url": "https://spdx.org/licenses/GPL-2.0-only.html"
35+
}
36+
],
37+
"lines": "12-150,540-561",
38+
"matched": "35%",
39+
"oss_lines": "10-148,86-107",
40+
"purl": [
41+
"pkg:github/kdrag0n/proton_bluecross",
42+
"pkg:github/fake/fake_repository"
43+
],
44+
"release_date": "2019-02-21",
45+
"server": {
46+
"kb_version": {
47+
"daily": "25.03.27",
48+
"monthly": "25.03"
49+
},
50+
"version": "5.4.10"
51+
},
52+
"source_hash": "45dd1e50621a8a32f88fbe0251a470ab",
53+
"status": "pending",
54+
"url": "https://github.com/kdrag0n/proton_bluecross",
55+
"url_hash": "a9c1c67f0930dc42dbd40c29e565bcdd",
56+
"vendor": "kdrag0n",
57+
"version": "15"
58+
}
59+
]
60+
}

plugins/scanners/scanoss/src/test/kotlin/ScanOssResultParserTest.kt

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.ossreviewtoolkit.plugins.scanners.scanoss
2121

22+
import com.scanoss.dto.LicenseDetails
2223
import com.scanoss.utils.JsonUtils
2324

2425
import io.kotest.core.spec.style.WordSpec
@@ -27,6 +28,7 @@ import io.kotest.matchers.collections.containExactlyInAnyOrder
2728
import io.kotest.matchers.collections.haveSize
2829
import io.kotest.matchers.collections.shouldContain
2930
import io.kotest.matchers.should
31+
import io.kotest.matchers.shouldBe
3032

3133
import java.io.File
3234
import java.time.Instant
@@ -39,9 +41,44 @@ import org.ossreviewtoolkit.model.SnippetFinding
3941
import org.ossreviewtoolkit.model.TextLocation
4042
import org.ossreviewtoolkit.model.VcsInfo
4143
import org.ossreviewtoolkit.model.VcsType
44+
import org.ossreviewtoolkit.utils.spdx.SpdxConstants
4245
import org.ossreviewtoolkit.utils.spdx.SpdxExpression
46+
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseIdExpression
4347

4448
class ScanOssResultParserTest : WordSpec({
49+
"getUniqueLicenseDetails()" should {
50+
"deduplicate complex license expressions" {
51+
val uniqueLicenses = getUniqueLicenseExpression(
52+
listOf(
53+
LicenseDetails.builder().name("MIT").build(),
54+
LicenseDetails.builder().name("MIT").build(),
55+
LicenseDetails.builder().name("GPL-2.0-only").build(),
56+
LicenseDetails.builder().name("GPL-2.0-only WITH Linux-syscall-note").build(),
57+
LicenseDetails.builder().name("GPL-2.0-only AND MIT").build()
58+
)
59+
)
60+
61+
val decomposed = uniqueLicenses.decompose().toList()
62+
63+
val expressionStrings = decomposed.map { it.toString() }
64+
65+
// Check that each license appears exactly once
66+
expressionStrings.count { it == "MIT" } shouldBe 1
67+
expressionStrings.count { it == "GPL-2.0-only" } shouldBe 1
68+
expressionStrings.count { it == "GPL-2.0-only WITH Linux-syscall-note" } shouldBe 1
69+
70+
// Ensure no unexpected elements
71+
expressionStrings.size shouldBe 3
72+
}
73+
74+
"handle empty license list" {
75+
val emptyLicenses = getUniqueLicenseExpression(listOf())
76+
77+
// Verify empty license list returns NOASSERTION
78+
emptyLicenses shouldBe SpdxLicenseIdExpression(SpdxConstants.NOASSERTION)
79+
}
80+
}
81+
4582
"generateSummary()" should {
4683
"properly summarize JUnit 4.12 findings" {
4784
val results = File("src/test/assets/scanoss-junit-4.12.json").readText().let {
@@ -126,11 +163,51 @@ class ScanOssResultParserTest : WordSpec({
126163
),
127164
"pkg:github/vdurmont/semver4j",
128165
SpdxExpression.parse("CC-BY-SA-2.0"),
129-
additionalData = mapOf("release_date" to "2019-09-13")
166+
additionalData = mapOf(
167+
"release_date" to "2019-09-13",
168+
"all_purls" to "pkg:github/vdurmont/semver4j"
169+
)
130170
)
131171
)
132172
)
133173
)
134174
}
175+
176+
"should handle multiple PURLs by selecting first as primary and preserving all in metadata" {
177+
val results = File("src/test/assets/scanoss-multiple-purls.json").readText().let {
178+
JsonUtils.toScanFileResultsFromObject(JsonUtils.toJsonObject(it))
179+
}
180+
181+
val time = Instant.now()
182+
val summary = generateSummary(time, time, results)
183+
184+
// Should have one finding per source location, not per PURL.
185+
summary.snippetFindings should haveSize(2)
186+
187+
with(summary.snippetFindings.first()) {
188+
// Check source location (local file).
189+
sourceLocation shouldBe TextLocation("hung_task.c", 12, 150)
190+
191+
// Should use first PURL as primary identifier.
192+
snippets should haveSize(1)
193+
snippets.first().purl shouldBe "pkg:github/kdrag0n/proton_bluecross"
194+
195+
// Should preserve all PURLs in additionalData.
196+
snippets.first().additionalData["all_purls"] shouldBe
197+
"pkg:github/kdrag0n/proton_bluecross pkg:github/fake/fake_repository"
198+
199+
// Check OSS location.
200+
snippets.first().location shouldBe
201+
TextLocation("https://api.scanoss.com/file_contents/581734935cfbe570d280a1265aaa2a6b", 10, 148)
202+
}
203+
204+
// Verify same behavior for second snippet.
205+
with(summary.snippetFindings.last()) {
206+
sourceLocation shouldBe TextLocation("hung_task.c", 540, 561)
207+
snippets.first().purl shouldBe "pkg:github/kdrag0n/proton_bluecross"
208+
snippets.first().location shouldBe
209+
TextLocation("https://api.scanoss.com/file_contents/581734935cfbe570d280a1265aaa2a6b", 86, 107)
210+
}
211+
}
135212
}
136213
})

plugins/scanners/scanoss/src/test/kotlin/ScanOssScannerDirectoryTest.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,11 @@ class ScanOssScannerDirectoryTest : StringSpec({
112112
),
113113
"pkg:github/scanoss/ort",
114114
SpdxExpression.parse("Apache-2.0"),
115-
additionalData = mapOf("release_date" to "2021-03-18")
115+
additionalData = mapOf(
116+
"release_date" to "2021-03-18",
117+
"all_purls" to "pkg:github/scanoss/ort"
118+
)
119+
116120
)
117121
)
118122
)

0 commit comments

Comments
 (0)