diff --git a/scanpipe/pipelines/inspect_packages.py b/scanpipe/pipelines/inspect_packages.py
index 7674f7f25f..1fe268b26d 100644
--- a/scanpipe/pipelines/inspect_packages.py
+++ b/scanpipe/pipelines/inspect_packages.py
@@ -22,6 +22,7 @@
 
 from aboutcode.pipeline import optional_step
 from scanpipe.pipelines.scan_codebase import ScanCodebase
+from scanpipe.pipes import maven
 from scanpipe.pipes import scancode
 
 
@@ -50,6 +51,7 @@ def steps(cls):
             cls.flag_empty_files,
             cls.flag_ignored_resources,
             cls.scan_for_application_packages,
+            cls.fix_maven_jar_packages,
             cls.resolve_dependencies,
         )
 
@@ -65,6 +67,10 @@ def scan_for_application_packages(self):
             progress_logger=self.log,
         )
 
+    def fix_maven_jar_packages(self):
+        """Fix JAR packages that should be Maven packages based on pom.properties."""
+        maven.detect_maven_jars_from_pom_properties(self.project, logger_func=self.log)
+
     @optional_step("StaticResolver")
     def resolve_dependencies(self):
         """
diff --git a/scanpipe/pipelines/scan_codebase.py b/scanpipe/pipelines/scan_codebase.py
index fd6580e456..a437011784 100644
--- a/scanpipe/pipelines/scan_codebase.py
+++ b/scanpipe/pipelines/scan_codebase.py
@@ -22,6 +22,7 @@
 
 from scanpipe import pipes
 from scanpipe.pipelines import Pipeline
+from scanpipe.pipes import maven
 from scanpipe.pipes import scancode
 from scanpipe.pipes.input import copy_inputs
 
@@ -44,6 +45,7 @@ def steps(cls):
             cls.flag_empty_files,
             cls.flag_ignored_resources,
             cls.scan_for_application_packages,
+            cls.fix_maven_jar_packages,
             cls.scan_for_files,
             cls.collect_and_create_license_detections,
         )
@@ -63,6 +65,10 @@ def scan_for_application_packages(self):
         """Scan unknown resources for packages information."""
         scancode.scan_for_application_packages(self.project, progress_logger=self.log)
 
+    def fix_maven_jar_packages(self):
+        """Fix JAR packages that should be Maven packages based on pom.properties."""
+        maven.detect_maven_jars_from_pom_properties(self.project, logger_func=self.log)
+
     def scan_for_files(self):
         """Scan unknown resources for copyrights, licenses, emails, and urls."""
         scancode.scan_for_files(self.project, progress_logger=self.log)
diff --git a/scanpipe/pipes/maven.py b/scanpipe/pipes/maven.py
new file mode 100644
index 0000000000..753741e9c8
--- /dev/null
+++ b/scanpipe/pipes/maven.py
@@ -0,0 +1,232 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+"""Support for Maven-specific package detection and PURL correction."""
+
+import logging
+import re
+from pathlib import Path
+
+from packageurl import PackageURL
+
+logger = logging.getLogger(__name__)
+
+# Path of the innermost JAR whose extracted content holds a resource: the greedy
+# ``.+`` makes nested archives resolve to the deepest ``.jar-extract/`` directory.
+JAR_EXTRACT_PATH_RE = re.compile(r"(.+\.jar)-extract/")
+
+# First ``=`` or ``:`` of a properties line, with the whitespace around it.
+PROPERTY_SEPARATOR_RE = re.compile(r"\s*[=:]\s*")
+
+
+def detect_maven_jars_from_pom_properties(project, logger_func=None):
+    """
+    Convert ``jar`` packages to ``maven`` packages using pom.properties files.
+
+    Each ``pom.properties`` resource located under a ``META-INF/maven/``
+    directory of the ``project`` codebase is read for its Maven coordinates.
+    The matching DiscoveredPackage of type ``jar``, if any, is then updated to
+    the ``maven`` type with these coordinates.
+
+    ``logger_func`` is an optional callable that receives progress messages.
+    Return the number of packages converted.
+    """
+    if logger_func:
+        logger_func("Detecting Maven JARs from pom.properties files...")
+
+    pom_properties_resources = project.codebaseresources.filter(
+        path__contains="META-INF/maven/", name="pom.properties"
+    )
+
+    maven_jars_fixed = 0
+    for pom_resource in pom_properties_resources:
+        maven_coords = get_maven_coordinates_from_pom_properties(pom_resource)
+        if not maven_coords:
+            continue
+
+        jar_package = get_jar_package_for_pom_resource(
+            project, pom_resource, maven_coords
+        )
+        if not jar_package:
+            continue
+
+        if convert_jar_package_to_maven(jar_package, maven_coords, logger_func):
+            maven_jars_fixed += 1
+
+    if logger_func and maven_jars_fixed:
+        logger_func(f"Fixed {maven_jars_fixed} JAR packages to use Maven PURLs")
+
+    return maven_jars_fixed
+
+
+def _parse_properties(text):
+    """
+    Return a dict of the key/value pairs found in the properties ``text``.
+
+    Only the simple layout is supported: one ``key=value`` or ``key:value`` pair
+    per line, blank lines, and ``#`` or ``!`` comment lines. Escapes and line
+    continuations are not interpreted.
+    """
+    properties = {}
+    for line in text.splitlines():
+        line = line.strip()
+        if not line or line.startswith(("#", "!")):
+            continue
+
+        parts = PROPERTY_SEPARATOR_RE.split(line, maxsplit=1)
+        if len(parts) == 2:
+            key, value = parts
+            properties[key] = value
+
+    return properties
+
+
+def get_maven_coordinates_from_pom_properties(pom_resource):
+    """
+    Return the Maven coordinates declared in the ``pom_resource`` file.
+
+    The coordinates are a dict with the ``group_id``, ``artifact_id``, and
+    ``version`` keys. Return None when the resource has no file on disk, when
+    the file cannot be read, or when any of the three values is missing.
+    """
+    location = pom_resource.location
+    if not location:
+        return None
+
+    pom_path = Path(location)
+    if not pom_path.exists():
+        return None
+
+    try:
+        text = pom_path.read_text(encoding="utf-8", errors="replace")
+    except OSError as error:
+        logger.warning("Unable to read %s: %s", location, error)
+        return None
+
+    properties = _parse_properties(text)
+    maven_coords = {
+        "group_id": properties.get("groupId"),
+        "artifact_id": properties.get("artifactId"),
+        "version": properties.get("version"),
+    }
+    if not all(maven_coords.values()):
+        return None
+
+    return maven_coords
+
+
+def get_jar_package_for_pom_resource(project, pom_resource, maven_coords):
+    """
+    Return the ``jar`` package of ``project`` described by ``pom_resource``.
+
+    Packages attached to the JAR file that holds ``pom_resource``, or to its
+    extracted content, are tried first. The other ``jar`` packages of the
+    project are only used as a fallback. In both cases the package has to be
+    consistent with ``maven_coords``. Return None when no package qualifies.
+    """
+    # "a.jar-extract/META-INF/maven/org.example/a/pom.properties" -> "a.jar"
+    jar_match = JAR_EXTRACT_PATH_RE.search(pom_resource.path)
+    if not jar_match:
+        return None
+
+    jar_path = jar_match.group(1)
+    extract_prefix = f"{jar_path}-extract/"
+    jar_packages = project.discoveredpackages.filter(type="jar")
+
+    # Select the packages of this JAR in the database instead of loading every
+    # resource of every package for each pom.properties file.
+    packages_of_jar = (
+        jar_packages.filter(codebase_resources__path=jar_path)
+        | jar_packages.filter(codebase_resources__path__startswith=extract_prefix)
+    ).distinct()
+
+    for candidates in (packages_of_jar, jar_packages):
+        for package in candidates:
+            if is_maven_coordinates_match(package, maven_coords):
+                return package
+
+    return None
+
+
+def is_maven_coordinates_match(package, maven_coords):
+    """
+    Return True when ``package`` is consistent with ``maven_coords``.
+
+    A version that differs from the Maven version always rules the package out.
+    The package name then has to be the artifact id, or to contain it when the
+    package version is known to agree. A shared version or namespace alone is
+    not a match, since unrelated packages can have these in common.
+    """
+    artifact_id = maven_coords["artifact_id"]
+    version = maven_coords["version"]
+
+    if package.version and package.version != version:
+        return False
+
+    name = package.name or ""
+    if name == artifact_id:
+        return True
+
+    # A looser name such as "org.example.lib" for the "lib" artifact is only
+    # trusted when the version confirms it.
+    return bool(package.version) and artifact_id in name
+
+
+def convert_jar_package_to_maven(jar_package, maven_coords, logger_func=None):
+    """
+    Update ``jar_package`` to the ``maven`` type using ``maven_coords``.
+
+    The namespace, name, and version of the package are replaced with the group
+    id, artifact id, and version of ``maven_coords``. ``logger_func`` is an
+    optional callable that receives the previous and new PURLs.
+
+    Return True on success. Any failure is logged and reported as False so that
+    one package does not interrupt the detection of the others.
+    """
+    try:
+        # Read the current PURL first: it is computed from the fields that the
+        # update below replaces.
+        previous_purl = jar_package.package_url
+
+        maven_purl = PackageURL(
+            type="maven",
+            namespace=maven_coords["group_id"],
+            name=maven_coords["artifact_id"],
+            version=maven_coords["version"],
+            qualifiers=jar_package.qualifiers,
+            subpath=jar_package.subpath,
+        )
+
+        jar_package.update(
+            type="maven",
+            namespace=maven_coords["group_id"],
+            name=maven_coords["artifact_id"],
+            version=maven_coords["version"],
+        )
+    except Exception:
+        logger.exception("Failed to convert package %s", jar_package.uuid)
+        return False
+
+    if logger_func:
+        logger_func(f"Converting JAR to Maven: {previous_purl} -> {maven_purl}")
+
+    return True
diff --git a/scanpipe/tests/pipes/test_maven.py b/scanpipe/tests/pipes/test_maven.py
new file mode 100644
index 0000000000..bcb13b2844
--- /dev/null
+++ b/scanpipe/tests/pipes/test_maven.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+from unittest.mock import Mock
+from unittest.mock import patch
+
+from django.test import TestCase
+
+from scanpipe.models import CodebaseResource
+from scanpipe.models import DiscoveredPackage
+from scanpipe.models import Project
+from scanpipe.pipes import maven
+
+
+class ScanPipeMavenTest(TestCase):
+    def setUp(self):
+        self.project = Project.objects.create(name="Test Maven Project")
+
+    def tearDown(self):
+        self.project.delete()
+
+    def test_detect_maven_jars_from_pom_properties_basic(self):
+        """Test basic Maven JAR detection from pom.properties files."""
+        # Create test files in the project's codebase directory
+        relative_dir = "test.jar-extract/META-INF/maven/io.perfmark/perfmark-api"
+        pom_properties_dir = self.project.codebase_path / relative_dir
+        pom_properties_dir.mkdir(parents=True)
+
+        pom_properties_path = pom_properties_dir / "pom.properties"
+        pom_properties_path.write_text(
+            "# Generated by Maven\n"
+            "groupId=io.perfmark\n"
+            "artifactId=perfmark-api\n"
+            "version=0.27.0\n"
+        )
+
+        # Create CodebaseResource without setting location (it's computed)
+        CodebaseResource.objects.create(
+            project=self.project,
+            path=f"{relative_dir}/pom.properties",
+            name="pom.properties",
+            type=CodebaseResource.Type.FILE,
+        )
+
+        # Create a JAR package (incorrectly detected as jar type)
+        jar_package = DiscoveredPackage.objects.create(
+            project=self.project,
+            type="jar",
+            name="perfmark-api",
+            version="0.27.0",
+        )
+
+        # Run the Maven detection
+        result = maven.detect_maven_jars_from_pom_properties(self.project)
+
+        # Verify results
+        self.assertEqual(1, result)
+
+        # Refresh the package from database
+        jar_package.refresh_from_db()
+
+        # Check that the package was updated to Maven type
+        self.assertEqual("maven", jar_package.type)
+        self.assertEqual("io.perfmark", jar_package.namespace)
+        self.assertEqual("perfmark-api", jar_package.name)
+        self.assertEqual("0.27.0", jar_package.version)
+
+    @patch("pathlib.Path.read_text")
+    def test_extract_maven_coordinates_from_pom_properties(self, mock_read_text):
+        """Test extraction of Maven coordinates from pom.properties content."""
+        # Mock the file content
+        mock_read_text.return_value = (
+            "# Generated by Maven\n"
+            "# Some comment\n"
+            "groupId=io.perfmark\n"
+            "artifactId=perfmark-api\n"
+            "version=0.27.0\n"
+            "someOtherProperty=value\n"
+        )
+
+        # Create a mock CodebaseResource
+        mock_resource = Mock()
+        mock_resource.location = "/fake/path/pom.properties"
+
+        # Mock Path.exists to return True
+        with patch("pathlib.Path.exists", return_value=True):
+            coords = maven.get_maven_coordinates_from_pom_properties(mock_resource)
+
+        # Verify the extracted coordinates
+        expected = {
+            "group_id": "io.perfmark",
+            "artifact_id": "perfmark-api",
+            "version": "0.27.0",
+        }
+        self.assertEqual(expected, coords)
+
+    @patch("pathlib.Path.read_text")
+    def test_extract_maven_coordinates_missing_fields(self, mock_read_text):
+        """Test extraction when required fields are missing."""
+        # Mock the file content with missing fields
+        mock_read_text.return_value = (
+            "# Generated by Maven\n"
+            "groupId=io.perfmark\n"
+            "# artifactId is missing\n"
+            "version=0.27.0\n"
+        )
+
+        # Create a mock CodebaseResource
+        mock_resource = Mock()
+        mock_resource.location = "/fake/path/pom.properties"
+
+        # Mock Path.exists to return True
+        with patch("pathlib.Path.exists", return_value=True):
+            coords = maven.get_maven_coordinates_from_pom_properties(mock_resource)
+
+        # Should return None when required fields are missing
+        self.assertIsNone(coords)
+
+    def test_is_maven_coordinates_match(self):
+        """Test that a shared version alone does not identify a package."""
+        maven_coords = {
+            "group_id": "io.perfmark",
+            "artifact_id": "perfmark-api",
+            "version": "0.27.0",
+        }
+
+        same_name = DiscoveredPackage(type="jar", name="perfmark-api")
+        self.assertTrue(maven.is_maven_coordinates_match(same_name, maven_coords))
+
+        other_name = DiscoveredPackage(type="jar", name="other", version="0.27.0")
+        self.assertFalse(maven.is_maven_coordinates_match(other_name, maven_coords))
+
+        mismatch = DiscoveredPackage(type="jar", name="perfmark-api", version="1.0.0")
+        self.assertFalse(maven.is_maven_coordinates_match(mismatch, maven_coords))
+
+    def test_no_maven_jars_detected(self):
+        """Test that no changes are made when no Maven JARs are found."""
+        # Create a regular JAR package without Maven metadata
+        jar_package = DiscoveredPackage.objects.create(
+            project=self.project,
+            type="jar",
+            name="some-library",
+            version="1.0.0",
+            package_uid="pkg:jar/some-library@1.0.0",
+        )
+
+        # Run the Maven detection
+        result = maven.detect_maven_jars_from_pom_properties(self.project)
+
+        # Verify no packages were modified
+        self.assertEqual(0, result)
+
+        # Refresh the package from database
+        jar_package.refresh_from_db()
+
+        # Check that the package remains unchanged
+        self.assertEqual("jar", jar_package.type)
+        self.assertEqual("some-library", jar_package.name)
+        self.assertEqual("1.0.0", jar_package.version)