From 16ab5933fe6caaff02d668f0f6d754ce1aead4c9 Mon Sep 17 00:00:00 2001
From: Varsha U N
Date: Mon, 18 Aug 2025 17:45:42 +0530
Subject: [PATCH 01/18] Add download archiving system with LocalFilesystem,
 S3, and SFTP providers

Signed-off-by: Varsha U N
---
 scancodeio/settings.py  |  57 +++++-
 scanpipe/archiving.py   | 405 ++++++++++++++++++++++++++++++++++++++++
 scanpipe/pipes/input.py | 125 ++++++++++++-
 3 files changed, 584 insertions(+), 3 deletions(-)
 create mode 100644 scanpipe/archiving.py

diff --git a/scancodeio/settings.py b/scancodeio/settings.py
index 2ffacb19f9..4d7c8cf472 100644
--- a/scancodeio/settings.py
+++ b/scancodeio/settings.py
@@ -23,8 +23,12 @@
 import sys
 import tempfile
 from pathlib import Path
-
+import logging
+
 import environ
+from scanpipe.archiving import LocalFilesystemProvider, S3LikeProvider, SftpProvider
+
+logger = logging.getLogger(__name__)
 
 PROJECT_DIR = environ.Path(__file__) - 1
 ROOT_DIR = PROJECT_DIR - 1
@@ -371,6 +373,59 @@
 CRISPY_TEMPLATE_PACK = "bootstrap3"
 
+# Storing archives locally or in S3 (Package Storage settings)
+
+ENABLE_DOWNLOAD_ARCHIVING = env.bool("ENABLE_DOWNLOAD_ARCHIVING", default=False)
+
+# localstorage, s3, sftp
+DOWNLOAD_ARCHIVING_PROVIDER = env.str("DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage")
+
+# For local storage, we would store the root path in that setting
+DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict("DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None)
+
+# Initialize the DownloadStore based on provider
+
+download_store = None
+if ENABLE_DOWNLOAD_ARCHIVING:
+    if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage":
+        config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
+        root_path = Path(config.get("root_path", "/var/scancodeio/downloads"))
+        try:
+            download_store = LocalFilesystemProvider(root_path=root_path)
+        except Exception as e:
+            logger.error(f"Failed to initialize LocalFilesystemProvider: {e}")
+    elif DOWNLOAD_ARCHIVING_PROVIDER == "s3":
+        config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
+        required_keys = ["bucket_name", "aws_userid", "aws_apikey"]
+        if not all(key in config for key in required_keys):
+            logger.error(f"S3 provider requires {required_keys} in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION")
+        else:
+            try:
+                download_store = S3LikeProvider(
+                    bucket_name=config.get("bucket_name"),
+                    aws_userid=config.get("aws_userid"),
+                    aws_apikey=config.get("aws_apikey"),
+                    other_aws_credentials=config.get("other_aws_credentials", {}),
+                )
+            except Exception as e:
+                logger.error(f"Failed to initialize S3LikeProvider: {e}")
+    elif DOWNLOAD_ARCHIVING_PROVIDER == "sftp":
+        config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
+        required_keys = ["host", "root_path", "ssh_credentials"]
+        if not all(key in config for key in required_keys):
+            logger.error(f"SFTP provider requires {required_keys} in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION")
+        else:
+            try:
+                download_store = SftpProvider(
+                    host=config.get("host"),
+                    root_path=config.get("root_path"),
+                    ssh_credentials=config.get("ssh_credentials", {}),
+                )
+            except Exception as e:
+                logger.error(f"Failed to initialize SftpProvider: {e}")
+    else:
+        logger.error(f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}")
+
 # Job Queue
 
 RQ_QUEUES = {
diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py
new file mode 100644
index 0000000000..ca72be2c01
--- /dev/null
+++ b/scanpipe/archiving.py
@@ -0,0 +1,405 @@
+# scanpipe/archiving.py
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import hashlib
+import json
+import logging
+from pathlib import Path
+import boto3
+from botocore.exceptions import ClientError
+import paramiko
+from paramiko.ssh_exception import SSHException
+import os
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class Download:
+    sha256: str
+    download_date: str
+    download_url: str
+    filename: str
+
+class DownloadStore(ABC):
+    def _compute_sha256(self, content: bytes) -> str:
+        """Compute SHA256 hash for content."""
+        return hashlib.sha256(content).hexdigest()
+
+    def _compute_origin_hash(self, filename: str, download_date: str, download_url: str) -> str:
+        """Compute a hash for the metadata to name the origin JSON file."""
+        to_hash = f"{filename}{download_date}{download_url}".encode("utf-8")
+        return hashlib.sha256(to_hash).hexdigest()
+
+    def _build_metadata(self, sha256: str, filename: str, download_date: str, download_url: str) -> dict:
+        """Build metadata dictionary for JSON storage."""
+        return {
+            "sha256": sha256,
+            "filename": filename,
+            "download_date": download_date,
+            "download_url": download_url
+        }
+
+    @abstractmethod
+    def _get_content_path(self, sha256: str) -> str:
+        """Get the storage path/key for the content based on SHA256."""
+        pass
+
+    @abstractmethod
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        pass
+
+    @abstractmethod
+    def get(self, sha256_checksum: str):
+        """Return a Download object for this checksum or None."""
+        pass
+
+    @abstractmethod
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store content with its metadata. Return a Download object on success. Raise an exception on error."""
+        pass
+
+    @abstractmethod
+    def find(self, download_url: str = None, filename: str = None, download_date: str = None):
+        """Return a Download object matching the metadata or None."""
+        pass
+
+class LocalFilesystemProvider(DownloadStore):
+    def __init__(self, root_path: Path):
+        self.root_path = root_path
+
+    def _get_content_path(self, sha256: str) -> Path:
+        """Create a nested path like 59/4c/67/... based on the SHA256 hash."""
+        return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]
+
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        downloads = []
+        for content_path in self.root_path.rglob("content"):
+            sha256 = str(content_path.parent.relative_to(self.root_path)).replace("/", "")
+            origin_files = list(content_path.parent.glob("origin-*.json"))
+            for origin_file in origin_files:
+                try:
+                    with open(origin_file, "r") as f:
+                        data = json.load(f)
+                        downloads.append(Download(**data))
+                except Exception as e:
+                    logger.error(f"Error reading {origin_file}: {e}")
+        return downloads
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        content_path = self._get_content_path(sha256_checksum)
+        if content_path.exists():
+            origin_files = list(content_path.glob("origin-*.json"))
+            if origin_files:
+                try:
+                    with open(origin_files[0], "r") as f:
+                        data = json.load(f)
+                        return Download(**data)
+                except Exception as e:
+                    logger.error(f"Error reading origin file for {sha256_checksum}: {e}")
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_path = self._get_content_path(sha256)
+        content_path.mkdir(parents=True, exist_ok=True)
+
+        content_file = content_path / "content"
+        if not content_file.exists():
+            try:
+                with open(content_file, 'wb') as f:
+                    f.write(content)
+            except Exception as e:
+                raise Exception(f"Failed to write content to {content_file}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_path = content_path / origin_filename
+        if origin_path.exists():
+            raise Exception(f"Origin {origin_filename} already exists")
+
+        metadata = self._build_metadata(sha256, filename, download_date, download_url)
+        try:
+            with open(origin_path, 'w') as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            raise Exception(f"Failed to write metadata to {origin_path}: {e}")
+
+        return Download(**metadata)
+
+    def find(self, download_url: str = None, filename: str = None, download_date: str = None):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        for content_path in self.root_path.rglob("origin-*.json"):
+            try:
+                with open(content_path, "r") as f:
+                    data = json.load(f)
+                    if (
+                        (download_url is None or data.get("download_url") == download_url) and
+                        (filename is None or data.get("filename") == filename) and
+                        (download_date is None or data.get("download_date") == download_date)
+                    ):
+                        return Download(**data)
+            except Exception as e:
+                logger.error(f"Error reading {content_path}: {e}")
+        return None
+
+class S3LikeProvider(DownloadStore):
+    def __init__(self, bucket_name: str, aws_userid: str, aws_apikey: str, other_aws_credentials: dict):
+        self.bucket_name = bucket_name
+        self.s3_client = boto3.client(
+            's3',
+            aws_access_key_id=aws_userid,
+            aws_secret_access_key=aws_apikey,
+            **(other_aws_credentials or {})
+        )
+
+    def _get_content_path(self, sha256: str) -> str:
+        """S3 key like 59/4c/67//"""
+        return f"{sha256[:2]}/{sha256[2:4]}/{sha256[4:]}/"
+
+    def list(self):
+        """List all stored downloads."""
+        downloads = []
+        try:
+            paginator = self.s3_client.get_paginator("list_objects_v2")
+            for page in paginator.paginate(Bucket=self.bucket_name):
+                for obj in page.get("Contents", []):
+                    key = obj["Key"]
+                    if key.endswith(".json"):
+                        try:
+                            response = self.s3_client.get_object(Bucket=self.bucket_name, Key=key)
+                            data = json.loads(response["Body"].read())
+                            downloads.append(Download(**data))
+                        except Exception as e:
+                            logger.error(f"Error reading S3 object {key}: {e}")
+        except ClientError as e:
+            logger.error(f"Failed to list S3 objects: {e}")
+        return downloads
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        prefix = self._get_content_path(sha256_checksum)
+        try:
+            response = self.s3_client.list_objects_v2(
+                Bucket=self.bucket_name,
+                Prefix=prefix,
+                MaxKeys=1
+            )
+            if "Contents" in response:
+                key = response["Contents"][0]["Key"]
+                obj_response = self.s3_client.get_object(Bucket=self.bucket_name, Key=key)
+                data = json.loads(obj_response["Body"].read())
+                return Download(**data)
+        except ClientError as e:
+            logger.error(f"Failed to get S3 object for {sha256_checksum}: {e}")
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_key = self._get_content_path(sha256) + "content"
+        try:
+            self.s3_client.head_object(Bucket=self.bucket_name, Key=content_key)
+            logger.info(f"Content already exists for {sha256}")
+        except ClientError:
+            try:
+                self.s3_client.put_object(
+                    Bucket=self.bucket_name,
+                    Key=content_key,
+                    Body=content,
+                )
+            except ClientError as e:
+                raise Exception(f"Failed to write content to S3 {content_key}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_key = self._get_content_path(sha256) + origin_filename
+
+        metadata = self._build_metadata(sha256, filename, download_date, download_url)
+        metadata_json = json.dumps(metadata, indent=2).encode("utf-8")
+        try:
+            self.s3_client.put_object(
+                Bucket=self.bucket_name,
+                Key=origin_key,
+                Body=metadata_json,
+            )
+        except ClientError as e:
+            raise Exception(f"Failed to write metadata to S3 {origin_key}: {e}")
+
+        return Download(**metadata)
+
+    def find(self, download_url: str = None, filename: str = None, download_date: str = None):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        try:
+            paginator = self.s3_client.get_paginator("list_objects_v2")
+            for page in paginator.paginate(Bucket=self.bucket_name):
+                for obj in page.get("Contents", []):
+                    key = obj["Key"]
+                    if key.endswith(".json"):
+                        try:
+                            response = self.s3_client.get_object(Bucket=self.bucket_name, Key=key)
+                            data = json.loads(response["Body"].read())
+                            if (
+                                (download_url is None or data.get("download_url") == download_url) and
+                                (filename is None or data.get("filename") == filename) and
+                                (download_date is None or data.get("download_date") == download_date)
+                            ):
+                                return Download(**data)
+                        except Exception as e:
+                            logger.error(f"Error reading S3 object {key}: {e}")
+        except ClientError as e:
+            logger.error(f"Failed to find in S3: {e}")
+        return None
+
+class SftpProvider(DownloadStore):
+    def __init__(self, host: str, root_path: str, ssh_credentials: dict):
+        self.host = host
+        self.root_path = Path(root_path)
+        self.ssh_credentials = ssh_credentials
+        self.ssh = paramiko.SSHClient()
+        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        try:
+            self.ssh.connect(
+                hostname=host,
+                username=ssh_credentials.get("username"),
+                password=ssh_credentials.get("password"),
+            )
+            self.sftp = self.ssh.open_sftp()
+        except SSHException as e:
+            raise Exception(f"Failed to connect to SFTP server {host}: {e}")
+
+    def _get_content_path(self, sha256: str) -> str:
+        """SFTP path like 59/4c/67//"""
+        return str(self.root_path / sha256[:2] / sha256[2:4] / sha256[4:])
+
+    def list(self):
+        """List all stored downloads."""
+        downloads = []
+        try:
+            for root, _, files in self._sftp_walk(self.root_path):
+                for filename in files:
+                    if filename.endswith(".json"):
+                        file_path = os.path.join(root, filename)
+                        try:
+                            with self.sftp.open(file_path, "r") as f:
+                                data = json.load(f)
+                                downloads.append(Download(**data))
+                        except Exception as e:
+                            logger.error(f"Error reading SFTP file {file_path}: {e}")
+        except SSHException as e:
+            logger.error(f"Failed to list SFTP files: {e}")
+        return downloads
+
+    def _sftp_walk(self, path):
+        """Recursively walk SFTP directory."""
+        path = str(path)
+        for entry in self.sftp.listdir_attr(path):
+            full_path = os.path.join(path, entry.filename)
+            if stat.S_ISDIR(entry.st_mode):
+                yield from self._sftp_walk(full_path)
+            else:
+                yield path, [], [entry.filename]
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        content_path = self._get_content_path(sha256_checksum)
+        try:
+            files = self.sftp.listdir(content_path)
+            origin_files = [f for f in files if f.startswith("origin-") and f.endswith(".json")]
+            if origin_files:
+                with self.sftp.open(os.path.join(content_path, origin_files[0]), "r") as f:
+                    data = json.load(f)
+                    return Download(**data)
+        except SSHException as e:
+            logger.error(f"Failed to get SFTP file for {sha256_checksum}: {e}")
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_path = self._get_content_path(sha256)
+        try:
+            self.sftp.mkdir(content_path)
+        except SSHException:
+            pass
+
+        content_file = os.path.join(content_path, "content")
+        try:
+            self.sftp.stat(content_file)
+            logger.info(f"Content already exists for {sha256}")
+        except SSHException:
+            try:
+                with self.sftp.open(content_file, 'wb') as f:
+                    f.write(content)
+            except SSHException as e:
+                raise Exception(f"Failed to write content to SFTP {content_file}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_path = os.path.join(content_path, origin_filename)
+        try:
+            self.sftp.stat(origin_path)
+            raise Exception(f"Origin {origin_filename} already exists")
+        except SSHException:
+            metadata = self._build_metadata(sha256, filename, download_date, download_url)
+            metadata_json = json.dumps(metadata, indent=2).encode("utf-8")
+            try:
+                with self.sftp.open(origin_path, 'wb') as f:
+                    f.write(metadata_json)
+            except SSHException as e:
+                raise Exception(f"Failed to write metadata to SFTP {origin_path}: {e}")
+
+        return Download(**metadata)
+
+    def find(self, download_url: str = None, filename: str = None, download_date: str = None):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        try:
+            for root, _, files in self._sftp_walk(self.root_path):
+                for filename in files:
+                    if filename.endswith(".json"):
+                        file_path = os.path.join(root, filename)
+                        try:
+                            with self.sftp.open(file_path, "r") as f:
+                                data = json.load(f)
+                                if (
+                                    (download_url is None or data.get("download_url") == download_url) and
+                                    (filename is None or data.get("filename") == filename) and
+                                    (download_date is None or data.get("download_date") == download_date)
+                                ):
+                                    return Download(**data)
+                        except Exception as e:
+                            logger.error(f"Error reading SFTP file {file_path}: {e}")
+        except SSHException as e:
+            logger.error(f"Failed to find in SFTP: {e}")
+        return None
\ No newline at end of file
diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py
index 8defc41c6e..835d851a42 100644
--- a/scanpipe/pipes/input.py
+++ b/scanpipe/pipes/input.py
@@ -23,6 +23,10 @@
 import os
 import shutil
 from pathlib import Path
+import logging
+from datetime import datetime
+import hashlib
+import requests
 
 from django.core.exceptions import FieldDoesNotExist
 from django.core.validators import EMPTY_VALUES
@@ -32,13 +36,14 @@
 from typecode.contenttype import get_type
 
 from scanpipe import pipes
-from scanpipe.models import CodebaseRelation
+from scanpipe.models import CodebaseRelation, InputSource
 from scanpipe.models import CodebaseResource
 from scanpipe.models import DiscoveredDependency
 from scanpipe.models import DiscoveredPackage
 from scanpipe.pipes import scancode
 from scanpipe.pipes.output import mappings_key_by_fieldname
-
+from scancodeio.settings import download_store, ENABLE_DOWNLOAD_ARCHIVING, DOWNLOAD_ARCHIVING_PROVIDER
+logger = logging.getLogger(__name__)
 
 def copy_input(input_location, dest_path):
     """Copy the ``input_location`` (file or directory) to the ``dest_path``."""
@@ -229,3 +234,119 @@ def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None):
     if extra_data_prefix:
         extra_data = {extra_data_prefix: extra_data}
     project.update_extra_data(extra_data)
+
+def add_input_from_url(project, url, filename=None):
+    """
+    Download the file from the provided ``url`` and add it as an InputSource for the
+    specified ``project``. Optionally, specify a ``filename`` for the downloaded file.
+    If archiving is enabled, store the content in the DownloadStore and save metadata.
+    """
+    try:
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+        content = response.content
+    except requests.RequestException as e:
+        logger.error(f"Failed to download {url}: {e}")
+        raise
+
+    should_archive = (
+        ENABLE_DOWNLOAD_ARCHIVING == "always" or
+        (ENABLE_DOWNLOAD_ARCHIVING == "per_project" and getattr(project, "archive_downloads", False)) or
+        (ENABLE_DOWNLOAD_ARCHIVING == "per_input" and "archive" in getattr(project, "input_tags", []))
+    )
+
+    filename = filename or url.split("/")[-1]
+    if should_archive and download_store:
+        sha256 = hashlib.sha256(content).hexdigest()
+        existing_download = download_store.get(sha256)
+        if not existing_download:
+            try:
+                download = download_store.put(
+                    content=content,
+                    download_url=url,
+                    download_date=datetime.now().isoformat(),
+                    filename=filename
+                )
+            except Exception as e:
+                logger.error(f"Failed to archive download for {url}: {e}")
+                raise
+        else:
+            download = existing_download
+
+        InputSource.objects.create(
+            project=project,
+            sha256=download.sha256,
+            download_url=download.download_url,
+            filename=download.filename,
+            download_date=download.download_date,
+            is_uploaded=False,
+        )
+    else:
+        input_path = project.input_path / filename
+        try:
+            with open(input_path, 'wb') as f:
+                f.write(content)
+        except Exception as e:
+            logger.error(f"Failed to save {filename} to {input_path}: {e}")
+            raise
+
+        InputSource.objects.create(
+            project=project,
+            filename=filename,
+            download_url=url,
+            is_uploaded=False,
+        )
+
+def add_input_from_upload(project, uploaded_file):
+    """
+    Add an uploaded file as an InputSource for the specified ``project``.
+    If archiving is enabled, store the content in the DownloadStore and save metadata.
+    """
+    content = uploaded_file.read()
+    filename = uploaded_file.name
+
+    should_archive = (
+        ENABLE_DOWNLOAD_ARCHIVING == "always" or
+        (ENABLE_DOWNLOAD_ARCHIVING == "per_project" and getattr(project, "archive_downloads", False)) or
+        (ENABLE_DOWNLOAD_ARCHIVING == "per_input" and "archive" in getattr(project, "input_tags", []))
+    )
+
+    if should_archive and download_store:
+        sha256 = hashlib.sha256(content).hexdigest()
+        existing_download = download_store.get(sha256)
+        if not existing_download:
+            try:
+                download = download_store.put(
+                    content=content,
+                    download_url="",  # No URL for uploads
+                    download_date=datetime.now().isoformat(),
+                    filename=filename
+                )
+            except Exception as e:
+                logger.error(f"Failed to archive upload {filename}: {e}")
+                raise
+        else:
+            download = existing_download
+
+        InputSource.objects.create(
+            project=project,
+            sha256=download.sha256,
+            download_url=download.download_url,
+            filename=download.filename,
+            download_date=download.download_date,
+            is_uploaded=True,
+        )
+    else:
+        input_path = project.input_path / filename
+        try:
+            with open(input_path, 'wb') as f:
+                f.write(content)
+        except Exception as e:
+            logger.error(f"Failed to save {filename} to {input_path}: {e}")
+            raise
+
+        InputSource.objects.create(
+            project=project,
+            filename=filename,
+            is_uploaded=True,
+        )
\ No newline at end of file
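A minimal sketch of how the DownloadStore API introduced in this patch can be exercised with the local provider; the temporary root and sample values below are illustrative assumptions, not part of the patch:

# Sketch: exercising LocalFilesystemProvider from scanpipe/archiving.py above.
import tempfile
from pathlib import Path

from scanpipe.archiving import LocalFilesystemProvider

store = LocalFilesystemProvider(root_path=Path(tempfile.mkdtemp()))
download = store.put(
    content=b"example bytes",
    download_url="https://example.com/sample.tar.gz",
    download_date="2025-08-18T17:45:42",
    filename="sample.tar.gz",
)
# Content lands at <root>/<sha256[:2]>/<sha256[2:4]>/<sha256[4:]>/content,
# with one origin-<hash>.json metadata file per (filename, date, url) origin.
assert store.get(download.sha256).filename == "sample.tar.gz"
assert store.find(download_url="https://example.com/sample.tar.gz") is not None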
["bucket_name", "aws_userid", "aws_apikey"] if not all(key in config for key in required_keys): - logger.error(f"S3 provider requires {required_keys} in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION") + logger.error( + f"S3 provider requires {required_keys}" + "in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION" + ) else: try: download_store = S3LikeProvider( @@ -413,7 +423,10 @@ config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} required_keys = ["host", "root_path", "ssh_credentials"] if not all(key in config for key in required_keys): - logger.error(f"SFTP provider requires {required_keys} in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION") + logger.error( + f"SFTP provider requires {required_keys}" + "in DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION" + ) else: try: download_store = SftpProvider( @@ -424,7 +437,9 @@ except Exception as e: logger.error(f"Failed to initialize SftpProvider: {e}") else: - logger.error(f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}") + logger.error( + f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}" + ) # Job Queue diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py index ca72be2c01..4e9910cd35 100644 --- a/scanpipe/archiving.py +++ b/scanpipe/archiving.py @@ -21,20 +21,24 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. -from abc import ABC, abstractmethod -from dataclasses import dataclass import hashlib import json import logging -from pathlib import Path -import boto3 -from botocore.exceptions import ClientError -import paramiko -from paramiko.ssh_exception import SSHException import os +import stat +from abc import ABC +from abc import abstractmethod +from dataclasses import dataclass +from pathlib import Path + +import boto3 +import paramiko +from botocore.exceptions import ClientError +from paramiko.ssh_exception import SSHException logger = logging.getLogger(__name__) + @dataclass class Download: sha256: str @@ -42,23 +46,28 @@ class Download: download_url: str filename: str + class DownloadStore(ABC): def _compute_sha256(self, content: bytes) -> str: """Compute SHA256 hash for content.""" return hashlib.sha256(content).hexdigest() - def _compute_origin_hash(self, filename: str, download_date: str, download_url: str) -> str: + def _compute_origin_hash( + self, filename: str, download_date: str, download_url: str + ) -> str: """Compute a hash for the metadata to name the origin JSON file.""" - to_hash = f"{filename}{download_date}{download_url}".encode("utf-8") + to_hash = f"{filename}{download_date}{download_url}".encode() return hashlib.sha256(to_hash).hexdigest() - def _build_metadata(self, sha256: str, filename: str, download_date: str, download_url: str) -> dict: + def _build_metadata( + self, sha256: str, filename: str, download_date: str, download_url: str + ) -> dict: """Build metadata dictionary for JSON storage.""" return { "sha256": sha256, "filename": filename, "download_date": download_date, - "url": download_url + "url": download_url, } @abstractmethod @@ -78,14 +87,20 @@ def get(self, sha256_checksum: str): @abstractmethod def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store content with its metadata. Return a Download object on success. Raise an exception on error.""" + """ + Store content with its metadata. Return a Download object on success. + Raise an exception on error. 
+ """ pass @abstractmethod - def find(self, download_url: str = None, filename: str = None, download_date: str = None): + def find( + self, download_url: str = None, filename: str = None, download_date: str = None + ): """Return a Download object matching the metadata or None.""" pass + class LocalFilesystemProvider(DownloadStore): def __init__(self, root_path: Path): self.root_path = root_path @@ -98,11 +113,10 @@ def list(self): """Return an iterable of all stored downloads.""" downloads = [] for content_path in self.root_path.rglob("content"): - sha256 = str(content_path.parent.relative_to(self.root_path)).replace("/", "") origin_files = list(content_path.parent.glob("origin-*.json")) for origin_file in origin_files: try: - with open(origin_file, "r") as f: + with open(origin_file) as f: data = json.load(f) downloads.append(Download(**data)) except Exception as e: @@ -116,11 +130,13 @@ def get(self, sha256_checksum: str): origin_files = list(content_path.glob("origin-*.json")) if origin_files: try: - with open(origin_files[0], "r") as f: + with open(origin_files[0]) as f: data = json.load(f) return Download(**data) except Exception as e: - logger.error(f"Error reading origin file for {sha256_checksum}: {e}") + logger.error( + f"Error reading origin file for {sha256_checksum}: {e}" + ) return None def put(self, content: bytes, download_url: str, download_date: str, filename: str): @@ -132,7 +148,7 @@ def put(self, content: bytes, download_url: str, download_date: str, filename: s content_file = content_path / "content" if not content_file.exists(): try: - with open(content_file, 'wb') as f: + with open(content_file, "wb") as f: f.write(content) except Exception as e: raise Exception(f"Failed to write content to {content_file}: {e}") @@ -145,39 +161,51 @@ def put(self, content: bytes, download_url: str, download_date: str, filename: s metadata = self._build_metadata(sha256, filename, download_date, download_url) try: - with open(origin_path, 'w') as f: + with open(origin_path, "w") as f: json.dump(metadata, f, indent=2) except Exception as e: raise Exception(f"Failed to write metadata to {origin_path}: {e}") return Download(**metadata) - def find(self, download_url: str = None, filename: str = None, download_date: str = None): + def find( + self, download_url: str = None, filename: str = None, download_date: str = None + ): """Find a download based on metadata.""" if not (download_url or filename or download_date): return None for content_path in self.root_path.rglob("origin-*.json"): try: - with open(content_path, "r") as f: + with open(content_path) as f: data = json.load(f) if ( - (download_url is None or data.get("url") == download_url) and - (filename is None or data.get("filename") == filename) and - (download_date is None or data.get("download_date") == download_date) + (download_url is None or data.get("url") == download_url) + and (filename is None or data.get("filename") == filename) + and ( + download_date is None + or data.get("download_date") == download_date + ) ): return Download(**data) except Exception as e: logger.error(f"Error reading {content_path}: {e}") return None + class S3LikeProvider(DownloadStore): - def __init__(self, bucket_name: str, aws_userid: str, aws_apikey: str, other_aws_credentials: dict): + def __init__( + self, + bucket_name: str, + aws_userid: str, + aws_apikey: str, + other_aws_credentials: dict, + ): self.bucket_name = bucket_name self.s3_client = boto3.client( - 's3', + "s3", aws_access_key_id=aws_userid, aws_secret_access_key=aws_apikey, - 
**(other_aws_credentials or {}) + **(other_aws_credentials or {}), ) def _get_content_path(self, sha256: str) -> str: @@ -194,7 +222,9 @@ def list(self): key = obj["Key"] if key.endswith(".json"): try: - response = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) + response = self.s3_client.get_object( + Bucket=self.bucket_name, Key=key + ) data = json.loads(response["Body"].read()) downloads.append(Download(**data)) except Exception as e: @@ -208,13 +238,13 @@ def get(self, sha256_checksum: str): prefix = self._get_content_path(sha256_checksum) try: response = self.s3_client.list_objects_v2( - Bucket=self.bucket_name, - Prefix=prefix, - MaxKeys=1 + Bucket=self.bucket_name, Prefix=prefix, MaxKeys=1 ) if "Contents" in response: key = response["Contents"][0]["Key"] - obj_response = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) + obj_response = self.s3_client.get_object( + Bucket=self.bucket_name, Key=key + ) data = json.loads(obj_response["Body"].read()) return Download(**data) except ClientError as e: @@ -255,7 +285,9 @@ def put(self, content: bytes, download_url: str, download_date: str, filename: s return Download(**metadata) - def find(self, download_url: str = None, filename: str = None, download_date: str = None): + def find( + self, download_url: str = None, filename: str = None, download_date: str = None + ): """Find a download based on metadata.""" if not (download_url or filename or download_date): return None @@ -266,12 +298,22 @@ def find(self, download_url: str = None, filename: str = None, download_date: st key = obj["Key"] if key.endswith(".json"): try: - response = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) + response = self.s3_client.get_object( + Bucket=self.bucket_name, Key=key + ) data = json.loads(response["Body"].read()) if ( - (download_url is None or data.get("url") == download_url) and - (filename is None or data.get("filename") == filename) and - (download_date is None or data.get("download_date") == download_date) + ( + download_url is None + or data.get("url") == download_url + ) + and ( + filename is None or data.get("filename") == filename + ) + and ( + download_date is None + or data.get("download_date") == download_date + ) ): return Download(**data) except Exception as e: @@ -280,6 +322,7 @@ def find(self, download_url: str = None, filename: str = None, download_date: st logger.error(f"Failed to find in S3: {e}") return None + class SftpProvider(DownloadStore): def __init__(self, host: str, root_path: str, ssh_credentials: dict): self.host = host @@ -334,9 +377,13 @@ def get(self, sha256_checksum: str): content_path = self._get_content_path(sha256_checksum) try: files = self.sftp.listdir(content_path) - origin_files = [f for f in files if f.startswith("origin-") and f.endswith(".json")] + origin_files = [ + f for f in files if f.startswith("origin-") and f.endswith(".json") + ] if origin_files: - with self.sftp.open(os.path.join(content_path, origin_files[0]), "r") as f: + with self.sftp.open( + os.path.join(content_path, origin_files[0]), "r" + ) as f: data = json.load(f) return Download(**data) except SSHException as e: @@ -358,7 +405,7 @@ def put(self, content: bytes, download_url: str, download_date: str, filename: s logger.info(f"Content already exists for {sha256}") except SSHException: try: - with self.sftp.open(content_file, 'wb') as f: + with self.sftp.open(content_file, "wb") as f: f.write(content) except SSHException as e: raise Exception(f"Failed to write content to SFTP {content_file}: {e}") @@ -370,17 
+417,21 @@ def put(self, content: bytes, download_url: str, download_date: str, filename: s self.sftp.stat(origin_path) raise Exception(f"Origin {origin_filename} already exists") except SSHException: - metadata = self._build_metadata(sha256, filename, download_date, download_url) + metadata = self._build_metadata( + sha256, filename, download_date, download_url + ) metadata_json = json.dumps(metadata, indent=2).encode("utf-8") try: - with self.sftp.open(origin_path, 'wb') as f: + with self.sftp.open(origin_path, "wb") as f: f.write(metadata_json) except SSHException as e: raise Exception(f"Failed to write metadata to SFTP {origin_path}: {e}") return Download(**metadata) - def find(self, download_url: str = None, filename: str = None, download_date: str = None): + def find( + self, download_url: str = None, filename: str = None, download_date: str = None + ): """Find a download based on metadata.""" if not (download_url or filename or download_date): return None @@ -393,13 +444,21 @@ def find(self, download_url: str = None, filename: str = None, download_date: st with self.sftp.open(file_path, "r") as f: data = json.load(f) if ( - (download_url is None or data.get("url") == download_url) and - (filename is None or data.get("filename") == filename) and - (download_date is None or data.get("download_date") == download_date) + ( + download_url is None + or data.get("url") == download_url + ) + and ( + filename is None or data.get("filename") == filename + ) + and ( + download_date is None + or data.get("download_date") == download_date + ) ): return Download(**data) except Exception as e: logger.error(f"Error reading SFTP file {file_path}: {e}") except SSHException as e: logger.error(f"Failed to find in SFTP: {e}") - return None \ No newline at end of file + return None diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py index b2a3f61cc5..303b35ae45 100644 --- a/scanpipe/pipelines/__init__.py +++ b/scanpipe/pipelines/__init__.py @@ -24,14 +24,18 @@ import logging import traceback from contextlib import contextmanager +from datetime import datetime from functools import wraps from pathlib import Path import bleach +import requests from markdown_it import MarkdownIt from pyinstrument import Profiler from aboutcode.pipeline import BasePipeline +from scanpipe.settings import ENABLE_DOWNLOAD_ARCHIVING +from scanpipe.settings import download_store logger = logging.getLogger(__name__) @@ -153,6 +157,46 @@ def download_missing_inputs(self): if error_tracebacks: raise InputFilesError(error_tracebacks) + def archive_downloads(self): + """ + Archive downloaded inputs to the centralized DownloadStore if not already + archived.Updates InputSource with archiving metadata (sha256, download_date). 
+ """ + logger.info(f"Archiving downloads for project {self.project.name}") + for input_source in self.project.inputsources.filter( + sha256__isnull=True, is_uploaded=False + ): + if input_source.download_url: + try: + response = requests.get( + input_source.download_url, stream=True,timeout=30 + ) + response.raise_for_status() + content = response.content + filename = ( + input_source.filename + or input_source.download_url.split("/")[-1] + ) + download = download_store.put( + content=content, + download_url=input_source.download_url, + download_date=datetime.now().isoformat(), + filename=filename, + ) + input_source.sha256 = download.sha256 + input_source.download_date = download.download_date + input_source.save() + except Exception as e: + self.add_error( + exception=e, + message=f"Failed to archive {input_source.download_url}", + ) + else: + logger.warning( + f"No download URL for input {input_source.filename}," + "skipping archiving" + ) + class ProjectPipeline(CommonStepsMixin, BasePipeline): """Main class for all project related pipelines including common steps methods.""" @@ -182,8 +226,12 @@ def __init__(self, run_instance): @classmethod def get_initial_steps(cls): """Add the ``download_inputs`` step as an initial step if enabled.""" + steps = [] if cls.download_inputs: - return (cls.download_missing_inputs,) + steps.append(cls.download_missing_inputs) + if ENABLE_DOWNLOAD_ARCHIVING: + steps.append(cls.archive_downloads) + return tuple(steps) @classmethod def get_info(cls, as_html=False): diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 835d851a42..9268d86376 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -20,31 +20,35 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import hashlib +import logging import os import shutil -from pathlib import Path -import logging from datetime import datetime -import hashlib -import requests +from pathlib import Path from django.core.exceptions import FieldDoesNotExist from django.core.validators import EMPTY_VALUES from django.db import models import openpyxl +import requests from typecode.contenttype import get_type from scanpipe import pipes -from scanpipe.models import CodebaseRelation, InputSource +from scanpipe.models import CodebaseRelation from scanpipe.models import CodebaseResource from scanpipe.models import DiscoveredDependency from scanpipe.models import DiscoveredPackage +from scanpipe.models import InputSource from scanpipe.pipes import scancode from scanpipe.pipes.output import mappings_key_by_fieldname -from scanpipe.settings import download_store, ENABLE_DOWNLOAD_ARCHIVING, DOWNLOAD_ARCHIVING_PROVIDER +from scanpipe.settings import ENABLE_DOWNLOAD_ARCHIVING +from scanpipe.settings import download_store + logger = logging.getLogger(__name__) + def copy_input(input_location, dest_path): """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" input_path = Path(input_location) @@ -235,6 +239,7 @@ def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): extra_data = {extra_data_prefix: extra_data} project.update_extra_data(extra_data) + def add_input_from_url(project, url, filename=None): """ Download the file from the provided ``url`` and add it as an InputSource for the @@ -242,7 +247,7 @@ def add_input_from_url(project, url, filename=None): If archiving is enabled, store the content in the DownloadStore and save metadata. 
""" try: - response = requests.get(url, stream=True) + response = requests.get(url, stream=True,timeout=30) response.raise_for_status() content = response.content except requests.RequestException as e: @@ -250,9 +255,15 @@ def add_input_from_url(project, url, filename=None): raise should_archive = ( - ENABLE_DOWNLOAD_ARCHIVING == "always" or - (ENABLE_DOWNLOAD_ARCHIVING == "per_project" and getattr(project, "archive_downloads", False)) or - (ENABLE_DOWNLOAD_ARCHIVING == "per_input" and "archive" in getattr(project, "input_tags", [])) + ENABLE_DOWNLOAD_ARCHIVING == "always" + or ( + ENABLE_DOWNLOAD_ARCHIVING == "per_project" + and getattr(project, "archive_downloads", False) + ) + or ( + ENABLE_DOWNLOAD_ARCHIVING == "per_input" + and "archive" in getattr(project, "input_tags", []) + ) ) filename = filename or url.split("/")[-1] @@ -265,7 +276,7 @@ def add_input_from_url(project, url, filename=None): content=content, download_url=url, download_date=datetime.now().isoformat(), - filename=filename + filename=filename, ) except Exception as e: logger.error(f"Failed to archive download for {url}: {e}") @@ -284,7 +295,7 @@ def add_input_from_url(project, url, filename=None): else: input_path = project.input_path / filename try: - with open(input_path, 'wb') as f: + with open(input_path, "wb") as f: f.write(content) except Exception as e: logger.error(f"Failed to save {filename} to {input_path}: {e}") @@ -297,6 +308,7 @@ def add_input_from_url(project, url, filename=None): is_uploaded=False, ) + def add_input_from_upload(project, uploaded_file): """ Add an uploaded file as an InputSource for the specified ``project``. @@ -306,9 +318,15 @@ def add_input_from_upload(project, uploaded_file): filename = uploaded_file.name should_archive = ( - ENABLE_DOWNLOAD_ARCHIVING == "always" or - (ENABLE_DOWNLOAD_ARCHIVING == "per_project" and getattr(project, "archive_downloads", False)) or - (ENABLE_DOWNLOAD_ARCHIVING == "per_input" and "archive" in getattr(project, "input_tags", [])) + ENABLE_DOWNLOAD_ARCHIVING == "always" + or ( + ENABLE_DOWNLOAD_ARCHIVING == "per_project" + and getattr(project, "archive_downloads", False) + ) + or ( + ENABLE_DOWNLOAD_ARCHIVING == "per_input" + and "archive" in getattr(project, "input_tags", []) + ) ) if should_archive and download_store: @@ -320,7 +338,7 @@ def add_input_from_upload(project, uploaded_file): content=content, download_url="", # No URL for uploads download_date=datetime.now().isoformat(), - filename=filename + filename=filename, ) except Exception as e: logger.error(f"Failed to archive upload {filename}: {e}") @@ -339,7 +357,7 @@ def add_input_from_upload(project, uploaded_file): else: input_path = project.input_path / filename try: - with open(input_path, 'wb') as f: + with open(input_path, "wb") as f: f.write(content) except Exception as e: logger.error(f"Failed to save {filename} to {input_path}: {e}") @@ -349,4 +367,4 @@ def add_input_from_upload(project, uploaded_file): project=project, filename=filename, is_uploaded=True, - ) \ No newline at end of file + ) diff --git a/scanpipe/tests/data/test-downloads/sample.tar.gz b/scanpipe/tests/data/test-downloads/sample.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e83f605c8683701c1a320145c4ba0b6e8284a62e GIT binary patch literal 30 mcmb2|=3uzMy*h(|Iaz{rae_!sz<~n{40`LgzGq-zU;qG&GzlC4 literal 0 HcmV?d00001 diff --git a/scanpipe/tests/test_archiving.py b/scanpipe/tests/test_archiving.py new file mode 100644 index 0000000000..a249c96c46 --- /dev/null +++ 
diff --git a/scanpipe/tests/test_archiving.py b/scanpipe/tests/test_archiving.py
new file mode 100644
index 0000000000..a249c96c46
--- /dev/null
+++ b/scanpipe/tests/test_archiving.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+
+import hashlib
+from pathlib import Path
+
+from django.test import TestCase
+
+from scanpipe.archiving import LocalFilesystemProvider
+from scanpipe.tests import make_project
+
+
+class TestArchiving(TestCase):
+    def setUp(self):
+        self.project = make_project()
+        self.root_path = Path(__file__).parent / "data" / "test_downloads"
+        self.store = LocalFilesystemProvider(root_path=self.root_path)
+        self.test_content = b"test content"
+        self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        self.test_filename = "sample.tar.gz"
+
+    def tearDown(self):
+        if self.root_path.exists():
+            import shutil
+
+            shutil.rmtree(self.root_path)
+
+    def test_local_filesystem_provider_put_get(self):
+        download = self.store.put(
+            content=self.test_content,
+            download_url=self.test_url,
+            download_date="2025-08-21T09:00:00",
+            filename=self.test_filename,
+        )
+        sha256 = hashlib.sha256(self.test_content).hexdigest()
+        self.assertEqual(download.sha256, sha256)
+        self.assertEqual(download.download_url, self.test_url)
+        self.assertEqual(download.filename, self.test_filename)
+        self.assertEqual(download.download_date, "2025-08-21T09:00:00")
+        content_path = (
+            self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content"
+        )
+        self.assertTrue(content_path.exists())
+        with open(content_path, "rb") as f:
+            self.assertEqual(f.read(), self.test_content)
+
+        retrieved = self.store.get(sha256)
+        self.assertEqual(retrieved.sha256, sha256)
+        self.assertEqual(retrieved.download_url, self.test_url)
+        self.assertEqual(retrieved.filename, self.test_filename)
+
+    def test_local_filesystem_provider_deduplication(self):
+        download1 = self.store.put(
+            content=self.test_content,
+            download_url=self.test_url,
+            download_date="2025-08-21T09:00:00",
+            filename=self.test_filename,
+        )
+        download2 = self.store.put(
+            content=self.test_content,
+            download_url="https://files.pythonhosted.org/packages/another.tar.gz",
+            download_date="2025-08-21T10:00:00",
+            filename="another.tar.gz",
+        )
+        self.assertEqual(download1.sha256, download2.sha256)
+        self.assertEqual(download1.download_url, self.test_url)
diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py
new file mode 100644
index 0000000000..32863463a9
--- /dev/null
+++ b/scanpipe/tests/test_input.py
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+
+from pathlib import Path
+from unittest.mock import patch
+
+from django.core.files.uploadedfile import SimpleUploadedFile
+from django.test import TestCase
+
+from scanpipe.models import InputSource
+from scanpipe.pipes.input import add_input_from_upload
+from scanpipe.pipes.input import add_input_from_url
+from scancodeio.settings import download_store
+from scanpipe.tests import make_project
+
+
+class TestInput(TestCase):
+    def setUp(self):
+        self.project = make_project()
+        self.test_filename = "sample.tar.gz"
+        self.test_data_path = (
+            Path(__file__).parent / "data" / "test-downloads" / self.test_filename
+        )
+        with open(self.test_data_path, "rb") as f:
+            self.test_content = f.read()
+
+    @patch("requests.get")
+    def test_add_input_from_url_with_archiving(self, mock_get):
+        with patch("scancodeio.settings.ENABLE_DOWNLOAD_ARCHIVING", "always"):
+            test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+            mock_get.return_value.raise_for_status.return_value = None
+            mock_get.return_value.content = self.test_content
+            mock_get.return_value.status_code = 200
+            add_input_from_url(self.project, test_url, filename=self.test_filename)
+            input_source = InputSource.objects.get(project=self.project)
+            self.assertEqual(input_source.filename, self.test_filename)
+            self.assertEqual(input_source.download_url, test_url)
+            self.assertTrue(input_source.sha256)
+            self.assertTrue(input_source.download_date)
+            self.assertFalse(input_source.is_uploaded)
+            if download_store:
+                download = download_store.get(input_source.sha256)
+                self.assertEqual(download.download_url, test_url)
+
+    @patch("requests.get")
+    def test_add_input_from_url_without_archiving(self, mock_get):
+        with patch("scancodeio.settings.ENABLE_DOWNLOAD_ARCHIVING", False):
+            test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+            mock_get.return_value.raise_for_status.return_value = None
+            mock_get.return_value.content = self.test_content
+            mock_get.return_value.status_code = 200
+            add_input_from_url(self.project, test_url, filename=self.test_filename)
+            input_source = InputSource.objects.get(project=self.project)
+            self.assertEqual(input_source.filename, self.test_filename)
+            self.assertEqual(input_source.download_url, test_url)
+            self.assertFalse(input_source.sha256)
+            self.assertFalse(input_source.download_date)
+            self.assertFalse(input_source.is_uploaded)
+            input_path = self.project.input_path / self.test_filename
+            self.assertTrue(input_path.exists())
+
+    def test_add_input_from_upload_with_archiving(self):
+        with patch("scancodeio.settings.ENABLE_DOWNLOAD_ARCHIVING", "always"):
+            uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content)
+            add_input_from_upload(self.project, uploaded_file)
+            input_source = InputSource.objects.get(project=self.project)
+            self.assertEqual(input_source.filename, self.test_filename)
+            self.assertEqual(input_source.download_url, "")
+            self.assertTrue(input_source.sha256)
+            self.assertTrue(input_source.download_date)
+            self.assertTrue(input_source.is_uploaded)
+            if download_store:
+                download = download_store.get(input_source.sha256)
+                self.assertEqual(download.filename, self.test_filename)
+
+    def test_add_input_from_upload_without_archiving(self):
+        with patch("scancodeio.settings.ENABLE_DOWNLOAD_ARCHIVING", False):
+            uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content)
+            add_input_from_upload(self.project, uploaded_file)
+            input_source = InputSource.objects.get(project=self.project)
+            self.assertEqual(input_source.filename, self.test_filename)
+            self.assertEqual(input_source.download_url, "")
+            self.assertFalse(input_source.sha256)
+            self.assertFalse(input_source.download_date)
+            self.assertTrue(input_source.is_uploaded)
+            input_path = self.project.input_path / self.test_filename
+            self.assertTrue(input_path.exists())
diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py
index 1f4f75d091..6c4248a226 100644
--- a/scanpipe/tests/test_pipelines.py
+++ b/scanpipe/tests/test_pipelines.py
@@ -29,6 +29,7 @@
 from pathlib import Path
 from unittest import mock
 from unittest import skipIf
+from unittest.mock import patch
 
 from django.conf import settings
 from django.test import TestCase
@@ -40,6 +41,7 @@
 from scanpipe import pipes
 from scanpipe.models import CodebaseResource
 from scanpipe.models import DiscoveredPackage
+from scanpipe.models import InputSource
 from scanpipe.pipelines import CommonStepsMixin
 from scanpipe.pipelines import InputFilesError
 from scanpipe.pipelines import Pipeline
@@ -285,6 +287,35 @@ def mock_make_to_path(**kwargs):
         self.assertEqual("scancode.io.git", input_source.filename)
         self.assertTrue(input_source.exists())
 
+    @mock.patch("requests.get")
+    def test_archive_downloads(self, mock_get):
+        project1 = make_project()
+        run = project1.add_pipeline("scan_codebase")
+        pipeline = run.make_pipeline_instance()
+        test_filename = "sample.tar.gz"
+        test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        test_data_path = (
+            Path(__file__).parent / "data" / "test-downloads" / test_filename
+        )
+        with open(test_data_path, "rb") as f:
+            test_content = f.read()
+
+        InputSource.objects.create(
+            project=project1,
+            filename=test_filename,
+            download_url=test_url,
+            is_uploaded=False,
+        )
+        with patch("scancodeio.settings.ENABLE_DOWNLOAD_ARCHIVING", "always"):
+            mock_get.return_value.content = test_content
+            mock_get.return_value.status_code = 200
+            pipeline.archive_downloads()
+        input_source = InputSource.objects.get(project=project1)
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertEqual(input_source.filename, test_filename)
+
     def test_scanpipe_pipeline_class_save_errors_context_manager(self):
         project1 = make_project()
         run = project1.add_pipeline("do_nothing")
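A caveat on the patching strategy in these tests: scanpipe.pipes.input binds ENABLE_DOWNLOAD_ARCHIVING at import time, so patching the name on the settings module does not change what add_input_from_url() sees. Patching the name where it is looked up, in the consuming module, is the usual unittest.mock idiom (a sketch under that assumption, reusing names from the tests above):

# Hypothetical variant of the test setup: patch the imported binding directly.
from unittest.mock import patch

with patch("scanpipe.pipes.input.ENABLE_DOWNLOAD_ARCHIVING", "always"):
    add_input_from_url(self.project, test_url, filename=self.test_filename)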
--- a/scanpipe/archiving.py +++ b/scanpipe/archiving.py @@ -31,10 +31,6 @@ from dataclasses import dataclass from pathlib import Path -import boto3 -import paramiko -from botocore.exceptions import ClientError -from paramiko.ssh_exception import SSHException logger = logging.getLogger(__name__) @@ -192,273 +188,3 @@ def find( return None -class S3LikeProvider(DownloadStore): - def __init__( - self, - bucket_name: str, - aws_userid: str, - aws_apikey: str, - other_aws_credentials: dict, - ): - self.bucket_name = bucket_name - self.s3_client = boto3.client( - "s3", - aws_access_key_id=aws_userid, - aws_secret_access_key=aws_apikey, - **(other_aws_credentials or {}), - ) - - def _get_content_path(self, sha256: str) -> str: - """S3 key like 59/4c/67//""" - return f"{sha256[:2]}/{sha256[2:4]}/{sha256[4:]}/" - - def list(self): - """List all stored downloads.""" - downloads = [] - try: - paginator = self.s3_client.get_paginator("list_objects_v2") - for page in paginator.paginate(Bucket=self.bucket_name): - for obj in page.get("Contents", []): - key = obj["Key"] - if key.endswith(".json"): - try: - response = self.s3_client.get_object( - Bucket=self.bucket_name, Key=key - ) - data = json.loads(response["Body"].read()) - downloads.append(Download(**data)) - except Exception as e: - logger.error(f"Error reading S3 object {key}: {e}") - except ClientError as e: - logger.error(f"Failed to list S3 objects: {e}") - return downloads - - def get(self, sha256_checksum: str): - """Retrieve a Download object for the given SHA256 hash.""" - prefix = self._get_content_path(sha256_checksum) - try: - response = self.s3_client.list_objects_v2( - Bucket=self.bucket_name, Prefix=prefix, MaxKeys=1 - ) - if "Contents" in response: - key = response["Contents"][0]["Key"] - obj_response = self.s3_client.get_object( - Bucket=self.bucket_name, Key=key - ) - data = json.loads(obj_response["Body"].read()) - return Download(**data) - except ClientError as e: - logger.error(f"Failed to get S3 object for {sha256_checksum}: {e}") - return None - - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store the content and its metadata.""" - sha256 = self._compute_sha256(content) - content_key = self._get_content_path(sha256) + "content" - try: - self.s3_client.head_object(Bucket=self.bucket_name, Key=content_key) - logger.info(f"Content already exists for {sha256}") - except ClientError: - try: - self.s3_client.put_object( - Bucket=self.bucket_name, - Key=content_key, - Body=content, - ) - except ClientError as e: - raise Exception(f"Failed to write content to S3 {content_key}: {e}") - - origin_hash = self._compute_origin_hash(filename, download_date, download_url) - origin_filename = f"origin-{origin_hash}.json" - origin_key = self._get_content_path(sha256) + origin_filename - - metadata = self._build_metadata(sha256, filename, download_date, download_url) - metadata_json = json.dumps(metadata, indent=2).encode("utf-8") - try: - self.s3_client.put_object( - Bucket=self.bucket_name, - Key=origin_key, - Body=metadata_json, - ) - except ClientError as e: - raise Exception(f"Failed to write metadata to S3 {origin_key}: {e}") - - return Download(**metadata) - - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Find a download based on metadata.""" - if not (download_url or filename or download_date): - return None - try: - paginator = self.s3_client.get_paginator("list_objects_v2") - for page in paginator.paginate(Bucket=self.bucket_name): - 
for obj in page.get("Contents", []): - key = obj["Key"] - if key.endswith(".json"): - try: - response = self.s3_client.get_object( - Bucket=self.bucket_name, Key=key - ) - data = json.loads(response["Body"].read()) - if ( - ( - download_url is None - or data.get("url") == download_url - ) - and ( - filename is None or data.get("filename") == filename - ) - and ( - download_date is None - or data.get("download_date") == download_date - ) - ): - return Download(**data) - except Exception as e: - logger.error(f"Error reading S3 object {key}: {e}") - except ClientError as e: - logger.error(f"Failed to find in S3: {e}") - return None - - -class SftpProvider(DownloadStore): - def __init__(self, host: str, root_path: str, ssh_credentials: dict): - self.host = host - self.root_path = Path(root_path) - self.ssh_credentials = ssh_credentials - self.ssh = paramiko.SSHClient() - self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - try: - self.ssh.connect( - hostname=host, - username=ssh_credentials.get("username"), - password=ssh_credentials.get("password"), - ) - self.sftp = self.ssh.open_sftp() - except SSHException as e: - raise Exception(f"Failed to connect to SFTP server {host}: {e}") - - def _get_content_path(self, sha256: str) -> str: - """SFTP path like 59/4c/67//""" - return str(self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]) - - def list(self): - """List all stored downloads.""" - downloads = [] - try: - for root, _, files in self._sftp_walk(self.root_path): - for filename in files: - if filename.endswith(".json"): - file_path = os.path.join(root, filename) - try: - with self.sftp.open(file_path, "r") as f: - data = json.load(f) - downloads.append(Download(**data)) - except Exception as e: - logger.error(f"Error reading SFTP file {file_path}: {e}") - except SSHException as e: - logger.error(f"Failed to list SFTP files: {e}") - return downloads - - def _sftp_walk(self, path): - """Recursively walk SFTP directory.""" - path = str(path) - for entry in self.sftp.listdir_attr(path): - full_path = os.path.join(path, entry.filename) - if stat.S_ISDIR(entry.st_mode): - yield from self._sftp_walk(full_path) - else: - yield path, [], [entry.filename] - - def get(self, sha256_checksum: str): - """Retrieve a Download object for the given SHA256 hash.""" - content_path = self._get_content_path(sha256_checksum) - try: - files = self.sftp.listdir(content_path) - origin_files = [ - f for f in files if f.startswith("origin-") and f.endswith(".json") - ] - if origin_files: - with self.sftp.open( - os.path.join(content_path, origin_files[0]), "r" - ) as f: - data = json.load(f) - return Download(**data) - except SSHException as e: - logger.error(f"Failed to get SFTP file for {sha256_checksum}: {e}") - return None - - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store the content and its metadata.""" - sha256 = self._compute_sha256(content) - content_path = self._get_content_path(sha256) - try: - self.sftp.mkdir(content_path) - except SSHException: - pass - - content_file = os.path.join(content_path, "content") - try: - self.sftp.stat(content_file) - logger.info(f"Content already exists for {sha256}") - except SSHException: - try: - with self.sftp.open(content_file, "wb") as f: - f.write(content) - except SSHException as e: - raise Exception(f"Failed to write content to SFTP {content_file}: {e}") - - origin_hash = self._compute_origin_hash(filename, download_date, download_url) - origin_filename = f"origin-{origin_hash}.json" - origin_path = 
os.path.join(content_path, origin_filename) - try: - self.sftp.stat(origin_path) - raise Exception(f"Origin {origin_filename} already exists") - except SSHException: - metadata = self._build_metadata( - sha256, filename, download_date, download_url - ) - metadata_json = json.dumps(metadata, indent=2).encode("utf-8") - try: - with self.sftp.open(origin_path, "wb") as f: - f.write(metadata_json) - except SSHException as e: - raise Exception(f"Failed to write metadata to SFTP {origin_path}: {e}") - - return Download(**metadata) - - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Find a download based on metadata.""" - if not (download_url or filename or download_date): - return None - try: - for root, _, files in self._sftp_walk(self.root_path): - for filename in files: - if filename.endswith(".json"): - file_path = os.path.join(root, filename) - try: - with self.sftp.open(file_path, "r") as f: - data = json.load(f) - if ( - ( - download_url is None - or data.get("url") == download_url - ) - and ( - filename is None or data.get("filename") == filename - ) - and ( - download_date is None - or data.get("download_date") == download_date - ) - ): - return Download(**data) - except Exception as e: - logger.error(f"Error reading SFTP file {file_path}: {e}") - except SSHException as e: - logger.error(f"Failed to find in SFTP: {e}") - return None diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py index ab239509e6..ddf652566e 100644 --- a/scanpipe/pipelines/__init__.py +++ b/scanpipe/pipelines/__init__.py @@ -34,8 +34,8 @@ from pyinstrument import Profiler from aboutcode.pipeline import BasePipeline -from scanpipe.settings import ENABLE_DOWNLOAD_ARCHIVING -from scanpipe.settings import download_store +from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING +from scancodeio.settings import download_store logger = logging.getLogger(__name__) diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 4d89e28068..ce50fb6e63 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -44,8 +44,8 @@ from scanpipe.models import InputSource from scanpipe.pipes import scancode from scanpipe.pipes.output import mappings_key_by_fieldname -from scanpipe.settings import ENABLE_DOWNLOAD_ARCHIVING -from scanpipe.settings import download_store +from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING +from scancodeio.settings import download_store logger = logging.getLogger(__name__) diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index 32863463a9..64e634865f 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -30,7 +30,7 @@ from scanpipe.models import InputSource from scanpipe.pipes.input import add_input_from_upload from scanpipe.pipes.input import add_input_from_url -from scanpipe.settings import download_store +from scancodeio.settings import download_store from scanpipe.tests import make_project From 35efe84d30c8aa2ae0054d7944c08988eb96975b Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Wed, 17 Sep 2025 05:17:59 +0530 Subject: [PATCH 04/18] fix CI errors Signed-off-by: Varsha U N --- scancodeio/settings.py | 28 +++--- scanpipe/archiving.py | 2 +- scanpipe/pipelines/__init__.py | 17 +++- scanpipe/pipes/input.py | 149 ++++++++++++----------------- scanpipe/tests/test_input.py | 158 +++++++++++++++++++------------ scanpipe/tests/test_pipelines.py | 39 ++++++-- 6 files changed, 218 insertions(+), 175 deletions(-) diff --git a/scancodeio/settings.py 
b/scancodeio/settings.py
index cb2c2a9983..2d7686900c 100644
--- a/scancodeio/settings.py
+++ b/scancodeio/settings.py
@@ -23,12 +23,13 @@
 import sys
 import tempfile
 from pathlib import Path
-from venv import logger
+import logging
 
 import environ
 
 from scanpipe.archiving import LocalFilesystemProvider
+
 
 PROJECT_DIR = environ.Path(__file__) - 1
 ROOT_DIR = PROJECT_DIR - 1
@@ -376,9 +377,10 @@
 
 CRISPY_TEMPLATE_PACK = "bootstrap3"
 
-# Storing archives locally (Package Storage settings)
-
-ENABLE_DOWNLOAD_ARCHIVING = env.bool("ENABLE_DOWNLOAD_ARCHIVING", default=False)
+# Centralized archive directory for all projects
+CENTRAL_ARCHIVE_PATH = env.str(
+    "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives"
+)
 
 # localstorage configuration
 DOWNLOAD_ARCHIVING_PROVIDER = env.str(
@@ -393,15 +395,15 @@
 # Initialize the DownloadStore for local storage
 
 download_store = None
-if ENABLE_DOWNLOAD_ARCHIVING:
-    if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage":
-        config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
-        root_path = Path(config.get("root_path", "/var/scancodeio/downloads"))
-        try:
-            download_store = LocalFilesystemProvider(root_path=root_path)
-        except Exception as e:
-            logger.error(f"Failed to initialize LocalFilesystemProvider: {e}")
-    else:
+logger = logging.getLogger(__name__)
+if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage":
+    config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {}
+    root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH))
+    try:
+        download_store = LocalFilesystemProvider(root_path=root_path)
+    except Exception as e:
+        logger.error(f"Failed to initialize LocalFilesystemProvider: {e}")
+else:
         logger.error(
             f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}"
         )
diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py
index 8d7dd9f853..482f448de5 100644
--- a/scanpipe/archiving.py
+++ b/scanpipe/archiving.py
@@ -63,7 +63,7 @@ def _build_metadata(
             "sha256": sha256,
             "filename": filename,
             "download_date": download_date,
-            "url": download_url,
+            "download_url": download_url,
         }
 
     @abstractmethod
diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py
index ddf652566e..1b6cd4e0a0 100644
--- a/scanpipe/pipelines/__init__.py
+++ b/scanpipe/pipelines/__init__.py
@@ -23,6 +23,7 @@
 import inspect
 import logging
 import traceback
+import hashlib
 from contextlib import contextmanager
 from datetime import datetime
 from functools import wraps
@@ -34,7 +35,7 @@
 from pyinstrument import Profiler
 
 from aboutcode.pipeline import BasePipeline
-from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
+from django.conf import settings
 from scancodeio.settings import download_store
 
 logger = logging.getLogger(__name__)
@@ -148,9 +148,23 @@ def download_missing_inputs(self):
                 error_tracebacks.append((msg, "No traceback available."))
                 continue
 
+            download_url = input_source.download_url
+            if not download_url:
+                continue
+
+            url_hash = hashlib.sha256(download_url.encode()).hexdigest()
+            filename = input_source.filename or Path(download_url).name or f"{url_hash}.archive"
+            archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
+
+            if archive_path.exists():
+                logger.info(f"Reusing existing archive at {archive_path}")
+                input_source.file_path = str(archive_path)
+                input_source.save()
+                continue
+
             self.log(f"Fetching input from {input_source.download_url}")
             try:
                 input_source.fetch()
             except Exception as error:
                 traceback_str = traceback.format_exc()
                 logger.error(traceback_str)
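For reference, the reuse check added to download_missing_inputs() above keys the central archive on the SHA256 of the download URL. A standalone sketch of the same derivation (the /var/scancodeio/archives value is the CENTRAL_ARCHIVE_PATH default introduced in this patch; the archive_path_for helper name is hypothetical, as the patch computes the path inline):

    import hashlib
    from pathlib import Path

    CENTRAL_ARCHIVE_PATH = "/var/scancodeio/archives"  # default from settings.py

    def archive_path_for(download_url, filename=None):
        url_hash = hashlib.sha256(download_url.encode()).hexdigest()
        name = filename or Path(download_url).name or f"{url_hash}.archive"
        return Path(CENTRAL_ARCHIVE_PATH) / url_hash / name

    # Two projects downloading the same URL resolve to the same archive file,
    # which is what the pipeline's archive_path.exists() reuse check relies on.
    print(archive_path_for("https://files.pythonhosted.org/packages/sample.tar.gz"))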
diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py
index ce50fb6e63..81ae91c21d 100644
--- a/scanpipe/pipes/input.py
+++ b/scanpipe/pipes/input.py
@@ -44,7 +44,7 @@
 from scanpipe.models import InputSource
 from scanpipe.pipes import scancode
 from scanpipe.pipes.output import mappings_key_by_fieldname
-from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
+from django.conf import settings
 from scancodeio.settings import download_store
 
 logger = logging.getLogger(__name__)
 
@@ -262,61 +261,47 @@ def add_input_from_url(project, url, filename=None):
         logger.error(f"Failed to download {url}: {e}")
         raise
 
-    should_archive = (
-        ENABLE_DOWNLOAD_ARCHIVING == "always"
-        or (
-            ENABLE_DOWNLOAD_ARCHIVING == "per_project"
-            and getattr(project, "archive_downloads", False)
-        )
-        or (
-            ENABLE_DOWNLOAD_ARCHIVING == "per_input"
-            and "archive" in getattr(project, "input_tags", [])
-        )
-    )
+    filename = filename or url.split("/")[-1] or "downloaded_file"
+    url_hash = hashlib.sha256(url.encode()).hexdigest()
+    archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
 
-    filename = filename or url.split("/")[-1]
-    if should_archive and download_store:
-        sha256 = hashlib.sha256(content).hexdigest()
-        existing_download = download_store.get(sha256)
-        if not existing_download:
-            try:
-                download = download_store.put(
-                    content=content,
-                    download_url=url,
-                    download_date=datetime.now().isoformat(),
-                    filename=filename,
-                )
-            except Exception as e:
-                logger.error(f"Failed to archive download for {url}: {e}")
-                raise
-        else:
-            download = existing_download
-
-        InputSource.objects.create(
-            project=project,
-            sha256=download.sha256,
-            download_url=download.download_url,
-            filename=download.filename,
-            download_date=download.download_date,
-            is_uploaded=False,
-        )
+    if download_store:
+        try:
+            download = download_store.put(
+                content=content,
+                download_url=url,
+                download_date=datetime.now().isoformat(),
+                filename=filename,
+            )
+            InputSource.objects.create(
+                project=project,
+                sha256=download.sha256,
+                download_url=download.download_url,
+                filename=download.filename,
+                download_date=download.download_date,
+                file_path=str(download.path),
+                is_uploaded=False,
+            )
+        except Exception as e:
+            logger.error(f"Failed to archive download for {url}: {e}")
+            raise
     else:
         input_path = project.input_path / filename
         try:
+            input_path.parent.mkdir(parents=True, exist_ok=True)
             with open(input_path, "wb") as f:
                 f.write(content)
+            InputSource.objects.create(
+                project=project,
+                filename=filename,
+                download_url=url,
+                file_path=str(input_path),
+                is_uploaded=False,
+            )
         except Exception as e:
             logger.error(f"Failed to save {filename} to {input_path}: {e}")
             raise
 
-    InputSource.objects.create(
-        project=project,
-        filename=filename,
-        download_url=url,
-        is_uploaded=False,
-    )
-
-
 def add_input_from_upload(project, uploaded_file):
     """
     Add an uploaded file as an InputSource for the specified ``project``.
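The add_input_from_url() rewrite above reduces to a single decision: archive through download_store when one is configured, otherwise fall back to the project's inputs directory. A condensed sketch of that flow, using a hypothetical store_content() helper rather than the patch's inline code (the Download object's sha256 and path attributes are the ones recorded into InputSource above):

    from datetime import datetime
    from pathlib import Path

    def store_content(content, url, filename, download_store, input_dir):
        """Return the on-disk path where ``content`` ends up."""
        if download_store:
            download = download_store.put(
                content=content,
                download_url=url,
                download_date=datetime.now().isoformat(),
                filename=filename,
            )
            return Path(download.path)
        # Fallback: keep the bytes in the project's own inputs directory.
        target = input_dir / filename
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(content)
        return target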
@@ -325,54 +310,38 @@ def add_input_from_upload(project, uploaded_file):
     content = uploaded_file.read()
     filename = uploaded_file.name
 
-    should_archive = (
-        ENABLE_DOWNLOAD_ARCHIVING == "always"
-        or (
-            ENABLE_DOWNLOAD_ARCHIVING == "per_project"
-            and getattr(project, "archive_downloads", False)
-        )
-        or (
-            ENABLE_DOWNLOAD_ARCHIVING == "per_input"
-            and "archive" in getattr(project, "input_tags", [])
-        )
-    )
-
-    if should_archive and download_store:
-        sha256 = hashlib.sha256(content).hexdigest()
-        existing_download = download_store.get(sha256)
-        if not existing_download:
-            try:
-                download = download_store.put(
-                    content=content,
-                    download_url="",  # No URL for uploads
-                    download_date=datetime.now().isoformat(),
-                    filename=filename,
-                )
-            except Exception as e:
-                logger.error(f"Failed to archive upload {filename}: {e}")
-                raise
-        else:
-            download = existing_download
-
-        InputSource.objects.create(
-            project=project,
-            sha256=download.sha256,
-            download_url=download.download_url,
-            filename=download.filename,
-            download_date=download.download_date,
-            is_uploaded=True,
-        )
+    if download_store:
+        try:
+            download = download_store.put(
+                content=content,
+                download_url="",
+                download_date=datetime.now().isoformat(),
+                filename=filename,
+            )
+            InputSource.objects.create(
+                project=project,
+                sha256=download.sha256,
+                download_url=download.download_url,
+                filename=download.filename,
+                download_date=download.download_date,
+                file_path=str(download.path),
+                is_uploaded=True,
+            )
+        except Exception as e:
+            logger.error(f"Failed to archive upload {filename}: {e}")
+            raise
     else:
         input_path = project.input_path / filename
         try:
+            input_path.parent.mkdir(parents=True, exist_ok=True)
             with open(input_path, "wb") as f:
                 f.write(content)
+            InputSource.objects.create(
+                project=project,
+                filename=filename,
+                file_path=str(input_path),
+                is_uploaded=True,
+            )
         except Exception as e:
             logger.error(f"Failed to save {filename} to {input_path}: {e}")
-            raise
-
-
-    InputSource.objects.create(
-        project=project,
-        filename=filename,
-        is_uploaded=True,
-    )
+            raise
\ No newline at end of file
diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py
index 64e634865f..3f2848cf1b 100644
--- a/scanpipe/tests/test_input.py
+++ b/scanpipe/tests/test_input.py
@@ -6,13 +6,16 @@
 # ScanCode is a trademark of nexB Inc.
 #
 # You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
+# You may obtain a copy of the License at:
+# http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing,
+# software distributed
 # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 # CONDITIONS OF ANY KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations under the License.
 #
-# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# Data Generated with ScanCode.io is provided on an
+# "AS IS" BASIS, WITHOUT WARRANTIES
 # OR CONDITIONS OF ANY KIND, either express or implied. No content created from
 # ScanCode.io should be considered or used as legal advice. Consult an Attorney
 # for any legal advice.
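The reworked tests that follow no longer toggle ENABLE_DOWNLOAD_ARCHIVING; instead they simulate the no-store fallback by patching the module-level download_store reference that scanpipe.pipes.input bound at import time. A minimal sketch of that standard unittest.mock pattern:

    from unittest.mock import patch

    # Inside the block, code in scanpipe.pipes.input sees download_store as
    # None and takes the project-inputs fallback branch.
    with patch("scanpipe.pipes.input.download_store", None):
        ...

Patching the importing module's attribute (rather than scancodeio.settings) matters because ``from scancodeio.settings import download_store`` copies the name into scanpipe.pipes.input at import time.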
@@ -30,7 +33,7 @@
 from scanpipe.models import InputSource
 from scanpipe.pipes.input import add_input_from_upload
 from scanpipe.pipes.input import add_input_from_url
-from scancodeio.settings import download_store
+from django.conf import settings
 from scanpipe.tests import make_project
 
 
@@ -39,69 +42,102 @@
     def setUp(self):
         self.project = make_project()
         self.test_filename = "sample.tar.gz"
         self.test_data_path = (
-            Path(__file__).parent / "data" / "test-downloads" / self.test_filename
+            Path(__file__).parent /
+            "data" /
+            "test-downloads" /
+            self.test_filename
         )
         with open(self.test_data_path, "rb") as f:
             self.test_content = f.read()
 
     @patch("requests.get")
-    def test_add_input_from_url_with_archiving(self):
-        with patch("scanpipe.settings.ENABLE_DOWNLOAD_ARCHIVING", "always"):
-            test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
-            mock_get = self.mocker.patch("requests.get")
-            mock_get.return_value.content = self.test_content
-            mock_get.return_value.status_code = 200
-            add_input_from_url(self.project, test_url, filename=self.test_filename)
-            input_source = InputSource.objects.get(project=self.project)
-            self.assertEqual(input_source.filename, self.test_filename)
-            self.assertEqual(input_source.download_url, test_url)
-            self.assertTrue(input_source.sha256)
-            self.assertTrue(input_source.download_date)
-            self.assertFalse(input_source.is_uploaded)
-            if download_store:
-                download = download_store.get(input_source.sha256)
-                self.assertEqual(download.download_url, test_url)
+    def test_add_input_from_url(self, mock_get):
+        test_url = (
+            "https://files.pythonhosted.org/"
+            "packages/sample.tar.gz"
+        )
+        mock_get.return_value.content = self.test_content
+        mock_get.return_value.status_code = 200
+        add_input_from_url(
+            self.project,
+            test_url,
+            filename=self.test_filename
+        )
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertFalse(input_source.is_uploaded)
+        self.assertTrue(
+            input_source.file_path.startswith(
+                settings.CENTRAL_ARCHIVE_PATH
+            )
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
 
+    @patch("scanpipe.pipes.input.download_store", None)
     @patch("requests.get")
-    def test_add_input_from_url_without_archiving(self):
-        with patch("scanpipe.settings.ENABLE_DOWNLOAD_ARCHIVING", False):
-            test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
-            mock_get = self.mocker.patch("requests.get")
-            mock_get.return_value.content = self.test_content
-            mock_get.return_value.status_code = 200
-            add_input_from_url(self.project, test_url, filename=self.test_filename)
-            input_source = InputSource.objects.get(project=self.project)
-            self.assertEqual(input_source.filename, self.test_filename)
-            self.assertEqual(input_source.download_url, test_url)
-            self.assertFalse(input_source.sha256)
-            self.assertFalse(input_source.download_date)
-            self.assertFalse(input_source.is_uploaded)
-            input_path = self.project.input_path / self.test_filename
-            self.assertTrue(input_path.exists())
+    def test_add_input_from_url_fallback(self, mock_get):
+        test_url = (
+            "https://files.pythonhosted.org/"
+            "packages/sample.tar.gz"
+        )
+        mock_get.return_value.content = self.test_content
+        mock_get.return_value.status_code = 200
+        add_input_from_url(
+            self.project,
+            test_url,
+            filename=self.test_filename
+        )
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertFalse(input_source.sha256)
+        self.assertFalse(input_source.download_date)
+        self.assertFalse(input_source.is_uploaded)
+        self.assertTrue(
+            str(input_source.file_path).startswith(
+                str(self.project.input_path)
+            )
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
 
-    def test_add_input_from_upload_with_archiving(self):
-        with patch("scanpipe.settings.ENABLE_DOWNLOAD_ARCHIVING", "always"):
-            uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content)
-            add_input_from_upload(self.project, uploaded_file)
-            input_source = InputSource.objects.get(project=self.project)
-            self.assertEqual(input_source.filename, self.test_filename)
-            self.assertEqual(input_source.download_url, "")
-            self.assertTrue(input_source.sha256)
-            self.assertTrue(input_source.download_date)
-            self.assertTrue(input_source.is_uploaded)
-            if download_store:
-                download = download_store.get(input_source.sha256)
-                self.assertEqual(download.filename, self.test_filename)
+    def test_add_input_from_upload(self):
+        uploaded_file = SimpleUploadedFile(
+            self.test_filename,
+            self.test_content
+        )
+        add_input_from_upload(self.project, uploaded_file)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, "")
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertTrue(input_source.is_uploaded)
+        self.assertTrue(
+            input_source.file_path.startswith(
+                settings.CENTRAL_ARCHIVE_PATH
+            )
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
 
-    def test_add_input_from_upload_without_archiving(self):
-        with patch("scanpipe.settings.ENABLE_DOWNLOAD_ARCHIVING", False):
-            uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content)
-            add_input_from_upload(self.project, uploaded_file)
-            input_source = InputSource.objects.get(project=self.project)
-            self.assertEqual(input_source.filename, self.test_filename)
-            self.assertEqual(input_source.download_url, "")
-            self.assertFalse(input_source.sha256)
-            self.assertFalse(input_source.download_date)
-            self.assertTrue(input_source.is_uploaded)
-            input_path = self.project.input_path / self.test_filename
-            self.assertTrue(input_path.exists())
+    @patch("scanpipe.pipes.input.download_store", None)
+    def test_add_input_from_upload_fallback(self):
+        uploaded_file = SimpleUploadedFile(
+            self.test_filename,
+            self.test_content
+        )
+        add_input_from_upload(self.project, uploaded_file)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, "")
+        self.assertFalse(input_source.sha256)
+        self.assertFalse(input_source.download_date)
+        self.assertTrue(input_source.is_uploaded)
+        self.assertTrue(
+            str(input_source.file_path).startswith(
+                str(self.project.input_path)
+            )
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py
index 42bd262e63..306ea85e17 100644
--- a/scanpipe/tests/test_pipelines.py
+++ b/scanpipe/tests/test_pipelines.py
@@ -306,15 +306,36 @@ def test_archive_downloads(self, mock_get):
             download_url=test_url,
             is_uploaded=False,
         )
-        with patch("scanpipe.settings.ENABLE_DOWNLOAD_ARCHIVING", "always"):
-            mock_get.return_value.content = test_content
-            mock_get.return_value.status_code = 200
-            pipeline.archive_downloads()
-            input_source = InputSource.objects.get(project=project1)
-            self.assertTrue(input_source.sha256)
-            self.assertTrue(input_source.download_date)
-            self.assertEqual(input_source.download_url, test_url)
-            self.assertEqual(input_source.filename, test_filename)
+
+        mock_get.return_value.content = test_content
+        mock_get.return_value.status_code = 200
+
+        pipeline.download_missing_inputs()
+        input_source.refresh_from_db()
+        self.assertTrue(input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH))
+        self.assertTrue(Path(input_source.file_path).exists())
+
+
+        pipeline.archive_downloads()
+        input_source.refresh_from_db()
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertEqual(input_source.filename, test_filename)
+
+        project2 = make_project(name="project2")
+        input_source2 = InputSource.objects.create(
+            project=project2,
+            filename=test_filename,
+            download_url=test_url,
+            is_uploaded=False,
+        )
+        run2 = project2.add_pipeline("scan_codebase")
+        pipeline2 = run2.make_pipeline_instance()
+        pipeline2.download_missing_inputs()
+        input_source2.refresh_from_db()
+        self.assertEqual(input_source.file_path, input_source2.file_path)
+        self.assertTrue(Path(input_source2.file_path).exists())
 
     def test_scanpipe_pipeline_class_save_errors_context_manager(self):
         project1 = make_project()

From 87c81bd08c57ac5ac6d1dee1cc21121cb3363687 Mon Sep 17 00:00:00 2001
From: Varsha U N
Date: Wed, 17 Sep 2025 07:49:41 +0530
Subject: [PATCH 05/18] add tests for storing packages

Signed-off-by: Varsha U N
---
 Dockerfile                       |  186 +-
 scancodeio/settings.py           |  979 ++++---
 scanpipe/archiving.py            |  375 ++-
 scanpipe/pipelines/__init__.py   |  699 ++---
 scanpipe/pipes/input.py          |  692 +++--
 scanpipe/tests/test_archiving.py |  172 +-
 scanpipe/tests/test_input.py     |  255 +-
 scanpipe/tests/test_pipelines.py | 4108 +++++++++++++++---------------
 8 files changed, 3716 insertions(+), 3750 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6a38d97eed..0cb8b60d73 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,94 +1,94 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
-# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
-# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
-# ScanCode.io should be considered or used as legal advice. Consult an Attorney
-# for any legal advice.
-#
-# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
-# Visit https://github.com/aboutcode-org/scancode.io for support and download.
- -FROM python:3.13-slim - -LABEL org.opencontainers.image.source="https://github.com/aboutcode-org/scancode.io" -LABEL org.opencontainers.image.description="ScanCode.io" -LABEL org.opencontainers.image.licenses="Apache-2.0" - -ENV APP_NAME scancodeio -ENV APP_USER app -ENV APP_DIR /opt/$APP_NAME -ENV VENV_LOCATION /opt/$APP_NAME/.venv - -# Force Python unbuffered stdout and stderr (they are flushed to terminal immediately) -ENV PYTHONUNBUFFERED 1 -# Do not write Python .pyc files -ENV PYTHONDONTWRITEBYTECODE 1 -# Add the app dir in the Python path for entry points availability -ENV PYTHONPATH $PYTHONPATH:$APP_DIR - -# OS requirements as per -# https://scancode-toolkit.readthedocs.io/en/latest/getting-started/install.html -# Also install universal-ctags and xgettext for symbol and string collection. -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - bzip2 \ - xz-utils \ - zlib1g \ - libxml2-dev \ - libxslt1-dev \ - libgomp1 \ - libsqlite3-0 \ - libgcrypt20 \ - libpopt0 \ - libzstd1 \ - libgpgme11 \ - libdevmapper1.02.1 \ - libguestfs-tools \ - linux-image-amd64 \ - git \ - wait-for-it \ - universal-ctags \ - gettext \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Create the APP_USER group and user -RUN addgroup --system $APP_USER \ - && adduser --system --group --home=$APP_DIR $APP_USER \ - && chown $APP_USER:$APP_USER $APP_DIR - -# Create the /var/APP_NAME directory with proper permission for APP_USER -RUN mkdir -p /var/$APP_NAME \ - && chown $APP_USER:$APP_USER /var/$APP_NAME - -# Setup the work directory and the user as APP_USER for the remaining stages -WORKDIR $APP_DIR -USER $APP_USER - -# Create the virtualenv -RUN python -m venv $VENV_LOCATION -# Enable the virtualenv, similar effect as "source activate" -ENV PATH $VENV_LOCATION/bin:$PATH - -# Create static/ and workspace/ directories -RUN mkdir -p /var/$APP_NAME/static/ \ - && mkdir -p /var/$APP_NAME/workspace/ - -# Install the dependencies before the codebase COPY for proper Docker layer caching -COPY --chown=$APP_USER:$APP_USER pyproject.toml $APP_DIR/ -RUN pip install --no-cache-dir . - -# Copy the codebase and set the proper permissions for the APP_USER +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+ +FROM python:3.13-slim + +LABEL org.opencontainers.image.source="https://github.com/aboutcode-org/scancode.io" +LABEL org.opencontainers.image.description="ScanCode.io" +LABEL org.opencontainers.image.licenses="Apache-2.0" + +ENV APP_NAME scancodeio +ENV APP_USER app +ENV APP_DIR /opt/$APP_NAME +ENV VENV_LOCATION /opt/$APP_NAME/.venv + +# Force Python unbuffered stdout and stderr (they are flushed to terminal immediately) +ENV PYTHONUNBUFFERED 1 +# Do not write Python .pyc files +ENV PYTHONDONTWRITEBYTECODE 1 +# Add the app dir in the Python path for entry points availability +ENV PYTHONPATH $PYTHONPATH:$APP_DIR + +# OS requirements as per +# https://scancode-toolkit.readthedocs.io/en/latest/getting-started/install.html +# Also install universal-ctags and xgettext for symbol and string collection. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bzip2 \ + xz-utils \ + zlib1g \ + libxml2-dev \ + libxslt1-dev \ + libgomp1 \ + libsqlite3-0 \ + libgcrypt20 \ + libpopt0 \ + libzstd1 \ + libgpgme11 \ + libdevmapper1.02.1 \ + libguestfs-tools \ + linux-image-amd64 \ + git \ + wait-for-it \ + universal-ctags \ + gettext \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Create the APP_USER group and user +RUN addgroup --system $APP_USER \ + && adduser --system --group --home=$APP_DIR $APP_USER \ + && chown $APP_USER:$APP_USER $APP_DIR + +# Create the /var/APP_NAME directory with proper permission for APP_USER +RUN mkdir -p /var/$APP_NAME \ + && chown $APP_USER:$APP_USER /var/$APP_NAME + +# Setup the work directory and the user as APP_USER for the remaining stages +WORKDIR $APP_DIR +USER $APP_USER + +# Create the virtualenv +RUN python -m venv $VENV_LOCATION +# Enable the virtualenv, similar effect as "source activate" +ENV PATH $VENV_LOCATION/bin:$PATH + +# Create static/ and workspace/ directories +RUN mkdir -p /var/$APP_NAME/static/ \ + && mkdir -p /var/$APP_NAME/workspace/ + +# Install the dependencies before the codebase COPY for proper Docker layer caching +COPY --chown=$APP_USER:$APP_USER pyproject.toml $APP_DIR/ +RUN pip install --no-cache-dir . + +# Copy the codebase and set the proper permissions for the APP_USER COPY --chown=$APP_USER:$APP_USER . $APP_DIR \ No newline at end of file diff --git a/scancodeio/settings.py b/scancodeio/settings.py index 2d7686900c..15e52a4440 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -1,491 +1,488 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. 
-# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import sys -import tempfile -from pathlib import Path -import logging - -import environ - -from scanpipe.archiving import LocalFilesystemProvider - - -PROJECT_DIR = environ.Path(__file__) - 1 -ROOT_DIR = PROJECT_DIR - 1 - -# True if running tests through `./manage test` -IS_TESTS = "test" in sys.argv - -# Environment - -ENV_FILE = "/etc/scancodeio/.env" -if not Path(ENV_FILE).exists(): - ENV_FILE = ROOT_DIR(".env") - -# Do not use local .env environment when running the tests. -if IS_TESTS: - ENV_FILE = None - -env = environ.Env() -environ.Env.read_env(ENV_FILE) - -# Security - -SECRET_KEY = env.str("SECRET_KEY", default="") - -ALLOWED_HOSTS = env.list( - "ALLOWED_HOSTS", - default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], -) - -CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) - -# SECURITY WARNING: don't run with debug turned on in production -DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) - -SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( - "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False -) - -SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) - -SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) - -X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") - -SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) - -CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) - -# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT -# are handled by the web server. -SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] - -# ScanCode.io - -SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") - -SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") - -SCANCODEIO_CONFIG_FILE = env.str( - "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" -) - -SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") - -# Set the number of parallel processes to use for ScanCode related scan execution. -# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs -# available on the machine. -SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) - -SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") - -# This setting defines the additional locations ScanCode.io will search for pipelines. -# This should be set to a list of strings that contain full paths to your additional -# pipelines directories. -SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) - -# Maximum time allowed for a pipeline to complete. -SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") - -# Default to 2 minutes. -SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) - -# Default to None which scans all files -SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) - -# List views pagination, controls the number of items displayed per page. 
-# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 -SCANCODEIO_PAGINATE_BY = env.dict( - "SCANCODEIO_PAGINATE_BY", - default={ - "project": 20, - "error": 50, - "resource": 100, - "package": 100, - "dependency": 100, - "license": 100, - "relation": 100, - }, -) - -# Default limit for "most common" entries in QuerySets. -SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) - -# The base URL (e.g., https://hostname/) of this application instance. -# Required for generating URLs to reference objects within the app, -# such as in webhook notifications. -SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") - -# Fetch authentication credentials - -# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" -SCANCODEIO_FETCH_BASIC_AUTH = env.dict( - "SCANCODEIO_FETCH_BASIC_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" -SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( - "SCANCODEIO_FETCH_DIGEST_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" -SCANCODEIO_FETCH_HEADERS = {} -FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") -for entry in FETCH_HEADERS_STR.split(";"): - if entry.strip(): - host, headers = entry.split("=", 1) - SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) - -# SCANCODEIO_NETRC_LOCATION="~/.netrc" -SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") -if SCANCODEIO_NETRC_LOCATION: - # Propagate the location to the environ for `requests.utils.get_netrc_auth` - env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION - -# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" -SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) - -# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" -SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( - "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" -) - -# This webhook will be added as WebhookSubscription for each new project. 
-# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False -SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) - -# Application definition - -INSTALLED_APPS = [ - # Local apps - # Must come before Third-party apps for proper templates override - "scanpipe", - # Django built-in - "django.contrib.auth", - "django.contrib.contenttypes", - "django.contrib.sessions", - "django.contrib.messages", - "django.contrib.staticfiles", - "django.contrib.admin", - "django.contrib.humanize", - # Third-party apps - "crispy_forms", - "crispy_bootstrap3", # required for the djangorestframework browsable API - "django_filters", - "rest_framework", - "rest_framework.authtoken", - "django_rq", - "django_probes", - "taggit", -] - -MIDDLEWARE = [ - "django.middleware.security.SecurityMiddleware", - "django.contrib.sessions.middleware.SessionMiddleware", - "django.middleware.common.CommonMiddleware", - "django.middleware.csrf.CsrfViewMiddleware", - "django.contrib.auth.middleware.AuthenticationMiddleware", - "django.contrib.messages.middleware.MessageMiddleware", - "django.middleware.clickjacking.XFrameOptionsMiddleware", - "scancodeio.middleware.TimezoneMiddleware", -] - -ROOT_URLCONF = "scancodeio.urls" - -WSGI_APPLICATION = "scancodeio.wsgi.application" - -SECURE_PROXY_SSL_HEADER = env.tuple( - "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") -) - -# Database - -DATABASES = { - "default": { - "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), - "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), - "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), - "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), - "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), - "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), - "ATOMIC_REQUESTS": True, - } -} - -DEFAULT_AUTO_FIELD = "django.db.models.AutoField" - -# Forms and filters - -FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") - -# Templates - -TEMPLATES = [ - { - "BACKEND": "django.template.backends.django.DjangoTemplates", - "APP_DIRS": True, - "OPTIONS": { - "debug": DEBUG, - "context_processors": [ - "django.contrib.auth.context_processors.auth", - "django.contrib.messages.context_processors.messages", - "django.template.context_processors.request", - "scancodeio.context_processors.versions", - ], - }, - }, -] - -# Login - -LOGIN_REDIRECT_URL = "project_list" - -# Passwords - -AUTH_PASSWORD_VALIDATORS = [ - { - "NAME": ( - "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" - ), - }, - { - "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", - "OPTIONS": { - "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), - }, - }, - { - "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", - }, -] - -# Testing - -if IS_TESTS: - from django.core.management.utils import get_random_secret_key - - SECRET_KEY = get_random_secret_key() - # Do not pollute the workspace while running the tests. - SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() - SCANCODEIO_REQUIRE_AUTHENTICATION = True - SCANCODEIO_SCAN_FILE_TIMEOUT = 120 - SCANCODEIO_POLICIES_FILE = None - # The default password hasher is rather slow by design. - # Using a faster hashing algorithm in the testing context to speed up the run. 
- PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] - -# Debug toolbar - -DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) -if DEBUG and DEBUG_TOOLBAR: - INSTALLED_APPS.append("debug_toolbar") - MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") - INTERNAL_IPS = ["127.0.0.1"] - -# Logging - -LOGGING = { - "version": 1, - "disable_existing_loggers": False, - "formatters": { - "simple": { - "format": "{levelname} {message}", - "style": "{", - }, - }, - "handlers": { - "null": { - "class": "logging.NullHandler", - }, - "console": { - "class": "logging.StreamHandler", - "formatter": "simple", - }, - }, - "loggers": { - "scanpipe": { - "handlers": ["null"] if IS_TESTS else ["console"], - "level": SCANCODEIO_LOG_LEVEL, - "propagate": False, - }, - "django": { - "handlers": ["null"] if IS_TESTS else ["console"], - "propagate": False, - }, - # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. - "django.db.backends": { - "level": SCANCODEIO_LOG_LEVEL, - }, - }, -} - -# Instead of sending out real emails the console backend just writes the emails -# that would be sent to the standard output. -EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" - -# Internationalization - -LANGUAGE_CODE = "en-us" - -FORMAT_MODULE_PATH = ["scancodeio.formats"] - -TIME_ZONE = env.str("TIME_ZONE", default="UTC") - -USE_I18N = True - -USE_TZ = True - -# Static files (CSS, JavaScript, Images) - -STATIC_URL = "/static/" - -STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") - -STATICFILES_DIRS = [ - PROJECT_DIR("static"), -] - -# Third-party apps - -CRISPY_TEMPLATE_PACK = "bootstrap3" - -# Centralized archive directory for all projects -CENTRAL_ARCHIVE_PATH = env.str( - "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" -) - -# localstorage configuration -DOWNLOAD_ARCHIVING_PROVIDER = env.str( - "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" -) - -# For local storage, we would store the root path in that setting -DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( - "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None -) - -# Initialize the DownloadStore for local storage - -download_store = None -logger = logging.getLogger(__name__) -if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": - config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} - root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) - try: - download_store = LocalFilesystemProvider(root_path=root_path) - except Exception as e: - logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") -else: - logger.error( - f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}" - ) - -# Job Queue - -RQ_QUEUES = { - "default": { - "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), - "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), - "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), - "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), - "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), - "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), - # Enable SSL for Redis connections when deploying ScanCode.io in environments - # where Redis is hosted on a separate system (e.g., cloud deployment or remote - # Redis server) to secure data in transit. 
- "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), - }, -} - -SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) -if not SCANCODEIO_ASYNC: - for queue_config in RQ_QUEUES.values(): - queue_config["ASYNC"] = False - -# ClamAV virus scan -CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) -CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") - -# Django restframework - -REST_FRAMEWORK = { - "DEFAULT_AUTHENTICATION_CLASSES": ( - "rest_framework.authentication.TokenAuthentication", - ), - "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), - "DEFAULT_RENDERER_CLASSES": ( - "rest_framework.renderers.JSONRenderer", - "rest_framework.renderers.BrowsableAPIRenderer", - "rest_framework.renderers.AdminRenderer", - ), - "DEFAULT_FILTER_BACKENDS": ( - "django_filters.rest_framework.DjangoFilterBackend", - "rest_framework.filters.SearchFilter", - ), - "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", - "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), - "UPLOADED_FILES_USE_URL": False, -} - -if not SCANCODEIO_REQUIRE_AUTHENTICATION: - REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( - "rest_framework.permissions.AllowAny", - ) - -# VulnerableCode integration - -VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") -VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") -VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") -VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") - -# PurlDB integration - -PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") -PURLDB_USER = env.str("PURLDB_USER", default="") -PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") -PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") - -# MatchCode.io integration - -MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") -MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") -MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") -MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") - -# FederatedCode integration - -FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( - "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" -).rstrip("/") -FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") -FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") -FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. 
Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import logging +import sys +import tempfile +from pathlib import Path + +import environ + +from scanpipe.archiving import LocalFilesystemProvider + +PROJECT_DIR = environ.Path(__file__) - 1 +ROOT_DIR = PROJECT_DIR - 1 + +# True if running tests through `./manage test` +IS_TESTS = "test" in sys.argv + +# Environment + +ENV_FILE = "/etc/scancodeio/.env" +if not Path(ENV_FILE).exists(): + ENV_FILE = ROOT_DIR(".env") + +# Do not use local .env environment when running the tests. +if IS_TESTS: + ENV_FILE = None + +env = environ.Env() +environ.Env.read_env(ENV_FILE) + +# Security + +SECRET_KEY = env.str("SECRET_KEY", default="") + +ALLOWED_HOSTS = env.list( + "ALLOWED_HOSTS", + default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], +) + +CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) + +# SECURITY WARNING: don't run with debug turned on in production +DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) + +SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( + "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False +) + +SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) + +SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) + +X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") + +SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) + +CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) + +# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT +# are handled by the web server. +SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] + +# ScanCode.io + +SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") + +SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") + +SCANCODEIO_CONFIG_FILE = env.str( + "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" +) + +SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") + +# Set the number of parallel processes to use for ScanCode related scan execution. +# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs +# available on the machine. +SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) + +SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") + +# This setting defines the additional locations ScanCode.io will search for pipelines. +# This should be set to a list of strings that contain full paths to your additional +# pipelines directories. +SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) + +# Maximum time allowed for a pipeline to complete. +SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") + +# Default to 2 minutes. +SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) + +# Default to None which scans all files +SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) + +# List views pagination, controls the number of items displayed per page. 
+# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 +SCANCODEIO_PAGINATE_BY = env.dict( + "SCANCODEIO_PAGINATE_BY", + default={ + "project": 20, + "error": 50, + "resource": 100, + "package": 100, + "dependency": 100, + "license": 100, + "relation": 100, + }, +) + +# Default limit for "most common" entries in QuerySets. +SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) + +# The base URL (e.g., https://hostname/) of this application instance. +# Required for generating URLs to reference objects within the app, +# such as in webhook notifications. +SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") + +# Fetch authentication credentials + +# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" +SCANCODEIO_FETCH_BASIC_AUTH = env.dict( + "SCANCODEIO_FETCH_BASIC_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" +SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( + "SCANCODEIO_FETCH_DIGEST_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" +SCANCODEIO_FETCH_HEADERS = {} +FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") +for entry in FETCH_HEADERS_STR.split(";"): + if entry.strip(): + host, headers = entry.split("=", 1) + SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) + +# SCANCODEIO_NETRC_LOCATION="~/.netrc" +SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") +if SCANCODEIO_NETRC_LOCATION: + # Propagate the location to the environ for `requests.utils.get_netrc_auth` + env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION + +# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" +SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) + +# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" +SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( + "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" +) + +# This webhook will be added as WebhookSubscription for each new project. 
+# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False +SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) + +# Application definition + +INSTALLED_APPS = [ + # Local apps + # Must come before Third-party apps for proper templates override + "scanpipe", + # Django built-in + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", + "django.contrib.humanize", + # Third-party apps + "crispy_forms", + "crispy_bootstrap3", # required for the djangorestframework browsable API + "django_filters", + "rest_framework", + "rest_framework.authtoken", + "django_rq", + "django_probes", + "taggit", +] + +MIDDLEWARE = [ + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", + "scancodeio.middleware.TimezoneMiddleware", +] + +ROOT_URLCONF = "scancodeio.urls" + +WSGI_APPLICATION = "scancodeio.wsgi.application" + +SECURE_PROXY_SSL_HEADER = env.tuple( + "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") +) + +# Database + +DATABASES = { + "default": { + "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), + "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), + "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), + "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), + "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, + } +} + +DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + +# Forms and filters + +FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") + +# Templates + +TEMPLATES = [ + { + "BACKEND": "django.template.backends.django.DjangoTemplates", + "APP_DIRS": True, + "OPTIONS": { + "debug": DEBUG, + "context_processors": [ + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "django.template.context_processors.request", + "scancodeio.context_processors.versions", + ], + }, + }, +] + +# Login + +LOGIN_REDIRECT_URL = "project_list" + +# Passwords + +AUTH_PASSWORD_VALIDATORS = [ + { + "NAME": ( + "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" + ), + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + "OPTIONS": { + "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), + }, + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, +] + +# Testing + +if IS_TESTS: + from django.core.management.utils import get_random_secret_key + + SECRET_KEY = get_random_secret_key() + # Do not pollute the workspace while running the tests. + SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() + SCANCODEIO_REQUIRE_AUTHENTICATION = True + SCANCODEIO_SCAN_FILE_TIMEOUT = 120 + SCANCODEIO_POLICIES_FILE = None + # The default password hasher is rather slow by design. + # Using a faster hashing algorithm in the testing context to speed up the run. 
+ PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] + +# Debug toolbar + +DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) +if DEBUG and DEBUG_TOOLBAR: + INSTALLED_APPS.append("debug_toolbar") + MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") + INTERNAL_IPS = ["127.0.0.1"] + +# Logging + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "simple": { + "format": "{levelname} {message}", + "style": "{", + }, + }, + "handlers": { + "null": { + "class": "logging.NullHandler", + }, + "console": { + "class": "logging.StreamHandler", + "formatter": "simple", + }, + }, + "loggers": { + "scanpipe": { + "handlers": ["null"] if IS_TESTS else ["console"], + "level": SCANCODEIO_LOG_LEVEL, + "propagate": False, + }, + "django": { + "handlers": ["null"] if IS_TESTS else ["console"], + "propagate": False, + }, + # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. + "django.db.backends": { + "level": SCANCODEIO_LOG_LEVEL, + }, + }, +} + +# Instead of sending out real emails the console backend just writes the emails +# that would be sent to the standard output. +EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" + +# Internationalization + +LANGUAGE_CODE = "en-us" + +FORMAT_MODULE_PATH = ["scancodeio.formats"] + +TIME_ZONE = env.str("TIME_ZONE", default="UTC") + +USE_I18N = True + +USE_TZ = True + +# Static files (CSS, JavaScript, Images) + +STATIC_URL = "/static/" + +STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") + +STATICFILES_DIRS = [ + PROJECT_DIR("static"), +] + +# Third-party apps + +CRISPY_TEMPLATE_PACK = "bootstrap3" + +# Centralized archive directory for all projects +CENTRAL_ARCHIVE_PATH = env.str( + "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" +) + +# localstorage configuration +DOWNLOAD_ARCHIVING_PROVIDER = env.str( + "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" +) + +# For local storage, we would store the root path in that setting +DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( + "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None +) + +# Initialize the DownloadStore for local storage + +download_store = None +logger = logging.getLogger(__name__) +if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": + config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} + root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) + try: + download_store = LocalFilesystemProvider(root_path=root_path) + except Exception as e: + logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") +else: + logger.error(f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}") + +# Job Queue + +RQ_QUEUES = { + "default": { + "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), + "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), + "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), + "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), + "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), + "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), + # Enable SSL for Redis connections when deploying ScanCode.io in environments + # where Redis is hosted on a separate system (e.g., cloud deployment or remote + # Redis server) to secure data in transit. 
+ "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), + }, +} + +SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) +if not SCANCODEIO_ASYNC: + for queue_config in RQ_QUEUES.values(): + queue_config["ASYNC"] = False + +# ClamAV virus scan +CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) +CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") + +# Django restframework + +REST_FRAMEWORK = { + "DEFAULT_AUTHENTICATION_CLASSES": ( + "rest_framework.authentication.TokenAuthentication", + ), + "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), + "DEFAULT_RENDERER_CLASSES": ( + "rest_framework.renderers.JSONRenderer", + "rest_framework.renderers.BrowsableAPIRenderer", + "rest_framework.renderers.AdminRenderer", + ), + "DEFAULT_FILTER_BACKENDS": ( + "django_filters.rest_framework.DjangoFilterBackend", + "rest_framework.filters.SearchFilter", + ), + "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", + "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), + "UPLOADED_FILES_USE_URL": False, +} + +if not SCANCODEIO_REQUIRE_AUTHENTICATION: + REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( + "rest_framework.permissions.AllowAny", + ) + +# VulnerableCode integration + +VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") +VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") +VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") +VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") + +# PurlDB integration + +PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") +PURLDB_USER = env.str("PURLDB_USER", default="") +PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") +PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") + +# MatchCode.io integration + +MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") +MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") +MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") +MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") + +# FederatedCode integration + +FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( + "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" +).rstrip("/") +FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") +FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") +FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py index 482f448de5..3f3d66e2e8 100644 --- a/scanpipe/archiving.py +++ b/scanpipe/archiving.py @@ -1,190 +1,185 @@ -# scanpipe/archiving.py -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
-# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import json -import logging -import os -import stat -from abc import ABC -from abc import abstractmethod -from dataclasses import dataclass -from pathlib import Path - - -logger = logging.getLogger(__name__) - - -@dataclass -class Download: - sha256: str - download_date: str - download_url: str - filename: str - - -class DownloadStore(ABC): - def _compute_sha256(self, content: bytes) -> str: - """Compute SHA256 hash for content.""" - return hashlib.sha256(content).hexdigest() - - def _compute_origin_hash( - self, filename: str, download_date: str, download_url: str - ) -> str: - """Compute a hash for the metadata to name the origin JSON file.""" - to_hash = f"{filename}{download_date}{download_url}".encode() - return hashlib.sha256(to_hash).hexdigest() - - def _build_metadata( - self, sha256: str, filename: str, download_date: str, download_url: str - ) -> dict: - """Build metadata dictionary for JSON storage.""" - return { - "sha256": sha256, - "filename": filename, - "download_date": download_date, - "download_url": download_url, - } - - @abstractmethod - def _get_content_path(self, sha256: str) -> str: - """Get the storage path/key for the content based on SHA256.""" - pass - - @abstractmethod - def list(self): - """Return an iterable of all stored downloads.""" - pass - - @abstractmethod - def get(self, sha256_checksum: str): - """Return a Download object for this checksum or None.""" - pass - - @abstractmethod - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """ - Store content with its metadata. Return a Download object on success. - Raise an exception on error. - """ - pass - - @abstractmethod - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Return a Download object matching the metadata or None.""" - pass - - -class LocalFilesystemProvider(DownloadStore): - def __init__(self, root_path: Path): - self.root_path = root_path - - def _get_content_path(self, sha256: str) -> Path: - """Create a nested path like 59/4c/67/... 
based on the SHA256 hash.""" - return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] - - def list(self): - """Return an iterable of all stored downloads.""" - downloads = [] - for content_path in self.root_path.rglob("content"): - origin_files = list(content_path.parent.glob("origin-*.json")) - for origin_file in origin_files: - try: - with open(origin_file) as f: - data = json.load(f) - downloads.append(Download(**data)) - except Exception as e: - logger.error(f"Error reading {origin_file}: {e}") - return downloads - - def get(self, sha256_checksum: str): - """Retrieve a Download object for the given SHA256 hash.""" - content_path = self._get_content_path(sha256_checksum) - if content_path.exists(): - origin_files = list(content_path.glob("origin-*.json")) - if origin_files: - try: - with open(origin_files[0]) as f: - data = json.load(f) - return Download(**data) - except Exception as e: - logger.error( - f"Error reading origin file for {sha256_checksum}: {e}" - ) - return None - - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store the content and its metadata.""" - sha256 = self._compute_sha256(content) - content_path = self._get_content_path(sha256) - content_path.mkdir(parents=True, exist_ok=True) - - content_file = content_path / "content" - if not content_file.exists(): - try: - with open(content_file, "wb") as f: - f.write(content) - except Exception as e: - raise Exception(f"Failed to write content to {content_file}: {e}") - - origin_hash = self._compute_origin_hash(filename, download_date, download_url) - origin_filename = f"origin-{origin_hash}.json" - origin_path = content_path / origin_filename - if origin_path.exists(): - raise Exception(f"Origin {origin_filename} already exists") - - metadata = self._build_metadata(sha256, filename, download_date, download_url) - try: - with open(origin_path, "w") as f: - json.dump(metadata, f, indent=2) - except Exception as e: - raise Exception(f"Failed to write metadata to {origin_path}: {e}") - - return Download(**metadata) - - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Find a download based on metadata.""" - if not (download_url or filename or download_date): - return None - for content_path in self.root_path.rglob("origin-*.json"): - try: - with open(content_path) as f: - data = json.load(f) - if ( - (download_url is None or data.get("url") == download_url) - and (filename is None or data.get("filename") == filename) - and ( - download_date is None - or data.get("download_date") == download_date - ) - ): - return Download(**data) - except Exception as e: - logger.error(f"Error reading {content_path}: {e}") - return None - - +# scanpipe/archiving.py +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import hashlib
+import json
+import logging
+from abc import ABC
+from abc import abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Download:
+    sha256: str
+    download_date: str
+    download_url: str
+    filename: str
+    # Location of the stored content, set by providers on put().
+    # Defaults to "" as it is not persisted in the origin JSON metadata.
+    path: str = ""
+
+
+class DownloadStore(ABC):
+    def _compute_sha256(self, content: bytes) -> str:
+        """Compute SHA256 hash for content."""
+        return hashlib.sha256(content).hexdigest()
+
+    def _compute_origin_hash(
+        self, filename: str, download_date: str, download_url: str
+    ) -> str:
+        """Compute a hash for the metadata to name the origin JSON file."""
+        to_hash = f"{filename}{download_date}{download_url}".encode()
+        return hashlib.sha256(to_hash).hexdigest()
+
+    def _build_metadata(
+        self, sha256: str, filename: str, download_date: str, download_url: str
+    ) -> dict:
+        """Build metadata dictionary for JSON storage."""
+        return {
+            "sha256": sha256,
+            "filename": filename,
+            "download_date": download_date,
+            "download_url": download_url,
+        }
+
+    @abstractmethod
+    def _get_content_path(self, sha256: str) -> str:
+        """Get the storage path/key for the content based on SHA256."""
+        pass
+
+    @abstractmethod
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        pass
+
+    @abstractmethod
+    def get(self, sha256_checksum: str):
+        """Return a Download object for this checksum or None."""
+        pass
+
+    @abstractmethod
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """
+        Store content with its metadata. Return a Download object on success.
+        Raise an exception on error.
+        """
+        pass
+
+    @abstractmethod
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Return a Download object matching the metadata or None."""
+        pass
+
+
+class LocalFilesystemProvider(DownloadStore):
+    def __init__(self, root_path: Path):
+        self.root_path = root_path
+
+    def _get_content_path(self, sha256: str) -> Path:
+        """Return a nested path like 59/4c/67/... based on the SHA256 hash."""
+        return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]
+
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        downloads = []
+        for content_path in self.root_path.rglob("content"):
+            origin_files = list(content_path.parent.glob("origin-*.json"))
+            for origin_file in origin_files:
+                try:
+                    with open(origin_file) as f:
+                        data = json.load(f)
+                    downloads.append(Download(**data))
+                except Exception as e:
+                    logger.error(f"Error reading {origin_file}: {e}")
+        return downloads
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        content_path = self._get_content_path(sha256_checksum)
+        if content_path.exists():
+            origin_files = list(content_path.glob("origin-*.json"))
+            if origin_files:
+                try:
+                    with open(origin_files[0]) as f:
+                        data = json.load(f)
+                    return Download(**data)
+                except Exception as e:
+                    logger.error(
+                        f"Error reading origin file for {sha256_checksum}: {e}"
+                    )
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_path = self._get_content_path(sha256)
+        content_path.mkdir(parents=True, exist_ok=True)
+
+        content_file = content_path / "content"
+        if not content_file.exists():
+            try:
+                with open(content_file, "wb") as f:
+                    f.write(content)
+            except Exception as e:
+                raise Exception(f"Failed to write content to {content_file}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_path = content_path / origin_filename
+        if origin_path.exists():
+            raise Exception(f"Origin {origin_filename} already exists")
+
+        metadata = self._build_metadata(sha256, filename, download_date, download_url)
+        try:
+            with open(origin_path, "w") as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            raise Exception(f"Failed to write metadata to {origin_path}: {e}")
+
+        # Include the on-disk location so callers can record it, e.g., on
+        # InputSource.file_path.
+        return Download(path=str(content_file), **metadata)
+
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        for content_path in self.root_path.rglob("origin-*.json"):
+            try:
+                with open(content_path) as f:
+                    data = json.load(f)
+                if (
+                    (download_url is None or data.get("download_url") == download_url)
+                    and (filename is None or data.get("filename") == filename)
+                    and (
+                        download_date is None
+                        or data.get("download_date") == download_date
+                    )
+                ):
+                    return Download(**data)
+            except Exception as e:
+                logger.error(f"Error reading {content_path}: {e}")
+        return None
diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py
index 1b6cd4e0a0..5153bf1887 100644
--- a/scanpipe/pipelines/__init__.py
+++ b/scanpipe/pipelines/__init__.py
@@ -1,346 +1,353 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import inspect -import logging -import traceback -import hashlib -from contextlib import contextmanager -from datetime import datetime -from functools import wraps -from pathlib import Path - -import bleach -import requests -from markdown_it import MarkdownIt -from pyinstrument import Profiler - -from aboutcode.pipeline import BasePipeline -from scancodeio.settings import download_store - -logger = logging.getLogger(__name__) - - -class InputFilesError(Exception): - """InputFile is missing or cannot be downloaded.""" - - def __init__(self, error_tracebacks): - self.error_tracebacks = error_tracebacks - super().__init__(self._generate_message()) - - def _generate_message(self): - message = "InputFilesError encountered with the following issues:\n" - for index, (error, tb) in enumerate(self.error_tracebacks, start=1): - message += f"\nError {index}: {str(error)}\n\n{tb}" - return message - - -def convert_markdown_to_html(markdown_text): - """Convert Markdown text to sanitized HTML.""" - # Using the "js-default" for safety. - html_content = MarkdownIt("js-default").renderInline(markdown_text) - # Sanitize HTML using bleach. - sanitized_html = bleach.clean(html_content) - return sanitized_html - - -class CommonStepsMixin: - """Common steps available on all project pipelines.""" - - def flag_empty_files(self): - """Flag empty files.""" - from scanpipe.pipes import flag - - flag.flag_empty_files(self.project) - - def flag_ignored_resources(self): - """Flag ignored resources based on Project ``ignored_patterns`` setting.""" - from scanpipe.pipes import flag - - ignored_patterns = self.env.get("ignored_patterns", []) - - if isinstance(ignored_patterns, str): - ignored_patterns = ignored_patterns.splitlines() - ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS) - - flag.flag_ignored_patterns( - codebaseresources=self.project.codebaseresources.no_status(), - patterns=ignored_patterns, - ) - - def extract_archive(self, location, target): - """Extract archive at `location` to `target`. 
Save errors as messages.""" - from scanpipe.pipes import scancode - - extract_errors = scancode.extract_archive(location, target) - - for resource_location, errors in extract_errors.items(): - resource_path = Path(resource_location) - - if resource_path.is_relative_to(self.project.codebase_path): - resource_path = resource_path.relative_to(self.project.codebase_path) - details = {"resource_path": str(resource_path)} - elif resource_path.is_relative_to(self.project.input_path): - resource_path = resource_path.relative_to(self.project.input_path) - details = {"path": f"input/{str(resource_path)}"} - else: - details = {"filename": str(resource_path.name)} - - self.project.add_error( - description="\n".join(errors), - model="extract_archive", - details=details, - ) - - def extract_archives(self, location=None): - """Extract archives located in the codebase/ directory with extractcode.""" - from scanpipe.pipes import scancode - - if not location: - location = self.project.codebase_path - - extract_errors = scancode.extract_archives(location=location, recurse=True) - - for resource_path, errors in extract_errors.items(): - self.project.add_error( - description="\n".join(errors), - model="extract_archives", - details={"resource_path": resource_path}, - ) - - # Reload the project env post-extraction as the scancode-config.yml file - # may be located in one of the extracted archives. - self.env = self.project.get_env() - - def download_missing_inputs(self): - """ - Download any InputSource missing on disk. - Raise an error if any of the uploaded files is not available or not reachable. - """ - error_tracebacks = [] - - for input_source in self.project.inputsources.all(): - if input_source.exists(): - continue - - if input_source.is_uploaded: - msg = f"Uploaded file {input_source} not available." - self.log(msg) - error_tracebacks.append((msg, "No traceback available.")) - continue - - download_url = input_source.download_url - if not download_url: - continue - - url_hash = hashlib.sha256(download_url.encode()).hexdigest() - filename = input_source.filename or Path(download_url).name or f"{url_hash}.archive" - archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if archive_path.exists(): - logger.info(f"Reusing existing archive at {archive_path}") - input_source.file_path = str(archive_path) - input_source.save() - continue - - self.log(f"Fetching input from {input_source.download_url}") - try: - input_source.fetch() - - except Exception as error: - traceback_str = traceback.format_exc() - logger.error(traceback_str) - self.log(f"{input_source.download_url} could not be fetched.") - error_tracebacks.append((str(error), traceback_str)) - - if error_tracebacks: - raise InputFilesError(error_tracebacks) - - def archive_downloads(self): - """ - Archive downloaded inputs to the centralized DownloadStore if not already - archived.Updates InputSource with archiving metadata (sha256, download_date). 
- """ - logger.info(f"Archiving downloads for project {self.project.name}") - for input_source in self.project.inputsources.filter( - sha256__isnull=True, is_uploaded=False - ): - if input_source.download_url: - try: - response = requests.get( - input_source.download_url, stream=True,timeout=30 - ) - response.raise_for_status() - content = response.content - filename = ( - input_source.filename - or input_source.download_url.split("/")[-1] - ) - download = download_store.put( - content=content, - download_url=input_source.download_url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - input_source.sha256 = download.sha256 - input_source.download_date = download.download_date - input_source.save() - except Exception as e: - self.add_error( - exception=e, - message=f"Failed to archive {input_source.download_url}", - ) - else: - logger.warning( - f"No download URL for input {input_source.filename}," - "skipping archiving" - ) - - -class ProjectPipeline(CommonStepsMixin, BasePipeline): - """Main class for all project related pipelines including common steps methods.""" - - # Flag specifying whether to download missing inputs as an initial step. - download_inputs = True - - # Optional URL that targets a view of the results relative to this Pipeline. - # This URL may contain dictionary-style string formatting, which will be - # interpolated against the project's field attributes. - # For example, you could use results_url="/project/{slug}/packages/?filter=value" - # to target the Package list view with an active filtering. - results_url = "" - - def __init__(self, run_instance): - """Load the Pipeline execution context from a Run database object.""" - self.run = run_instance - self.project = run_instance.project - self.env = self.project.get_env() - - self.pipeline_class = run_instance.pipeline_class - self.pipeline_name = run_instance.pipeline_name - - self.selected_groups = run_instance.selected_groups or [] - self.selected_steps = run_instance.selected_steps or [] - - self.ecosystem_config = None - - @classmethod - def get_initial_steps(cls): - """Add the ``download_inputs`` step as an initial step if enabled.""" - steps = [] - if cls.download_inputs: - steps.append(cls.download_missing_inputs) - if ENABLE_DOWNLOAD_ARCHIVING: - steps.append(cls.archive_downloads) - return tuple(steps) - - @classmethod - def get_info(cls, as_html=False): - """Add the option to render the values as HTML.""" - info = super().get_info() - - if as_html: - info["summary"] = convert_markdown_to_html(info["summary"]) - info["description"] = convert_markdown_to_html(info["description"]) - for step in info["steps"]: - step["doc"] = convert_markdown_to_html(step["doc"]) - - return info - - def append_to_log(self, message): - self.run.append_to_log(message) - - def set_current_step(self, message): - self.run.set_current_step(message) - - def add_error(self, exception, resource=None): - """Create a ``ProjectMessage`` ERROR record on the current `project`.""" - self.project.add_error( - model=self.pipeline_name, - exception=exception, - object_instance=resource, - ) - - @contextmanager - def save_errors(self, *exceptions, **kwargs): - """ - Context manager to save specified exceptions as ``ProjectMessage`` in the - database. 
- - - Example in a Pipeline step:: - - with self.save_errors(rootfs.DistroNotFound): - rootfs.scan_rootfs_for_system_packages(self.project, rfs) - - - Example when iterating over resources:: - - for resource in self.project.codebaseresources.all(): - with self.save_errors(Exception, resource=resource): - analyse(resource) - """ - try: - yield - except exceptions as error: - self.add_error(exception=error, **kwargs) - - -class Pipeline(ProjectPipeline): - """Alias for the ProjectPipeline class.""" - - pass - - -def is_pipeline(obj): - """ - Return True if the `obj` is a subclass of `Pipeline` except for the - `Pipeline` class itself. - """ - return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline - - -def profile(step): - """ - Profile a Pipeline step and save the results as HTML file in the project output - directory. - - Usage: - @profile - def step(self): - pass - """ - - @wraps(step) - def wrapper(*arg, **kwargs): - pipeline_instance = arg[0] - project = pipeline_instance.project - - with Profiler() as profiler: - result = step(*arg, **kwargs) - - output_file = project.get_output_file_path("profile", "html") - output_file.write_text(profiler.output_html()) - - pipeline_instance.log(f"Profiling results at {output_file.resolve()}") - - return result - - return wrapper +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
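+
+# Download archiving: when a ``download_store`` is configured in the settings,
+# the ``archive_downloads`` step below records each fetched input in the
+# content-addressed store. A sketch of the store API as used in this module
+# (the values are illustrative only):
+#
+#   download = download_store.put(
+#       content=b"<archive bytes>",
+#       download_url="https://example.com/package-1.0.tar.gz",
+#       download_date=datetime.now().isoformat(),
+#       filename="package-1.0.tar.gz",
+#   )
+#   # download.sha256, download.download_date, and download.path are then
+#   # recorded on the matching InputSource.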
+
+import hashlib
+import inspect
+import logging
+import traceback
+from contextlib import contextmanager
+from datetime import datetime
+from functools import wraps
+from pathlib import Path
+
+from django.conf import settings
+
+import bleach
+from markdown_it import MarkdownIt
+from pyinstrument import Profiler
+
+from aboutcode.pipeline import BasePipeline
+from scancodeio.settings import download_store
+
+logger = logging.getLogger(__name__)
+
+
+class InputFilesError(Exception):
+    """InputFile is missing or cannot be downloaded."""
+
+    def __init__(self, error_tracebacks):
+        self.error_tracebacks = error_tracebacks
+        super().__init__(self._generate_message())
+
+    def _generate_message(self):
+        message = "InputFilesError encountered with the following issues:\n"
+        for index, (error, tb) in enumerate(self.error_tracebacks, start=1):
+            message += f"\nError {index}: {str(error)}\n\n{tb}"
+        return message
+
+
+def convert_markdown_to_html(markdown_text):
+    """Convert Markdown text to sanitized HTML."""
+    # Using the "js-default" for safety.
+    html_content = MarkdownIt("js-default").renderInline(markdown_text)
+    # Sanitize HTML using bleach.
+    sanitized_html = bleach.clean(html_content)
+    return sanitized_html
+
+
+class CommonStepsMixin:
+    """Common steps available on all project pipelines."""
+
+    def flag_empty_files(self):
+        """Flag empty files."""
+        from scanpipe.pipes import flag
+
+        flag.flag_empty_files(self.project)
+
+    def flag_ignored_resources(self):
+        """Flag ignored resources based on Project ``ignored_patterns`` setting."""
+        from scanpipe.pipes import flag
+
+        ignored_patterns = self.env.get("ignored_patterns", [])
+
+        if isinstance(ignored_patterns, str):
+            ignored_patterns = ignored_patterns.splitlines()
+        ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)
+
+        flag.flag_ignored_patterns(
+            codebaseresources=self.project.codebaseresources.no_status(),
+            patterns=ignored_patterns,
+        )
+
+    def extract_archive(self, location, target):
+        """Extract archive at `location` to `target`. Save errors as messages."""
+        from scanpipe.pipes import scancode
+
+        extract_errors = scancode.extract_archive(location, target)
+
+        for resource_location, errors in extract_errors.items():
+            resource_path = Path(resource_location)
+
+            if resource_path.is_relative_to(self.project.codebase_path):
+                resource_path = resource_path.relative_to(self.project.codebase_path)
+                details = {"resource_path": str(resource_path)}
+            elif resource_path.is_relative_to(self.project.input_path):
+                resource_path = resource_path.relative_to(self.project.input_path)
+                details = {"path": f"input/{str(resource_path)}"}
+            else:
+                details = {"filename": str(resource_path.name)}
+
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archive",
+                details=details,
+            )
+
+    def extract_archives(self, location=None):
+        """Extract archives located in the codebase/ directory with extractcode."""
+        from scanpipe.pipes import scancode
+
+        if not location:
+            location = self.project.codebase_path
+
+        extract_errors = scancode.extract_archives(location=location, recurse=True)
+
+        for resource_path, errors in extract_errors.items():
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archives",
+                details={"resource_path": resource_path},
+            )
+
+        # Reload the project env post-extraction as the scancode-config.yml file
+        # may be located in one of the extracted archives.
+        self.env = self.project.get_env()
+
+    def download_missing_inputs(self):
+        """
+        Download any InputSource missing on disk.
+        Raise an error if any of the uploaded files is not available or not reachable.
+        """
+        error_tracebacks = []
+
+        for input_source in self.project.inputsources.all():
+            if input_source.exists():
+                continue
+
+            if input_source.is_uploaded:
+                msg = f"Uploaded file {input_source} not available."
+                self.log(msg)
+                error_tracebacks.append((msg, "No traceback available."))
+                continue
+
+            download_url = input_source.download_url
+            if not download_url:
+                continue
+
+            url_hash = hashlib.sha256(download_url.encode()).hexdigest()
+            filename = (
+                input_source.filename
+                or Path(download_url).name
+                or f"{url_hash}.archive"
+            )
+            archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
+
+            if archive_path.exists():
+                logger.info(f"Reusing existing archive at {archive_path}")
+                input_source.file_path = str(archive_path)
+                input_source.save()
+                continue
+
+            self.log(f"Fetching input from {input_source.download_url}")
+            try:
+                input_source.fetch()
+            except Exception as error:
+                traceback_str = traceback.format_exc()
+                logger.error(traceback_str)
+                self.log(f"{input_source.download_url} could not be fetched.")
+                error_tracebacks.append((str(error), traceback_str))
+
+        if error_tracebacks:
+            raise InputFilesError(error_tracebacks)
+
+    def archive_downloads(self):
+        """
+        Archive downloaded inputs to the centralized DownloadStore if not already
+        archived. Update the InputSource with the archiving metadata (sha256,
+        download_date, file_path).
+        """
+        if download_store is None:
+            logger.warning(
+                "No download store is configured, skipping download archiving."
+            )
+            return
+
+        logger.info(f"Archiving downloads for project {self.project.name}")
+        for input_source in self.project.inputsources.filter(
+            sha256__isnull=True, is_uploaded=False
+        ):
+            if not input_source.download_url:
+                logger.warning(
+                    f"No download URL for input {input_source.filename}, "
+                    "skipping archiving"
+                )
+                continue
+
+            if not input_source.file_path:
+                logger.warning(
+                    f"No file_path for input {input_source.download_url}, "
+                    "skipping archiving"
+                )
+                continue
+
+            try:
+                with open(input_source.file_path, "rb") as f:
+                    content = f.read()
+                filename = (
+                    input_source.filename or input_source.download_url.split("/")[-1]
+                )
+                download = download_store.put(
+                    content=content,
+                    download_url=input_source.download_url,
+                    download_date=datetime.now().isoformat(),
+                    filename=filename,
+                )
+                input_source.sha256 = download.sha256
+                input_source.download_date = download.download_date
+                input_source.file_path = str(download.path)
+                input_source.save()
+            except Exception as e:
+                logger.error(f"Failed to archive {input_source.download_url}: {e}")
+                self.add_error(exception=e)
+
+
+class ProjectPipeline(CommonStepsMixin, BasePipeline):
+    """Main class for all project related pipelines including common steps methods."""
+
+    # Flag specifying whether to download missing inputs as an initial step.
+    download_inputs = True
+
+    # Optional URL that targets a view of the results relative to this Pipeline.
+    # This URL may contain dictionary-style string formatting, which will be
+    # interpolated against the project's field attributes.
+    # For example, you could use results_url="/project/{slug}/packages/?filter=value"
+    # to target the Package list view with an active filtering.
+ results_url = "" + + def __init__(self, run_instance): + """Load the Pipeline execution context from a Run database object.""" + self.run = run_instance + self.project = run_instance.project + self.env = self.project.get_env() + + self.pipeline_class = run_instance.pipeline_class + self.pipeline_name = run_instance.pipeline_name + + self.selected_groups = run_instance.selected_groups or [] + self.selected_steps = run_instance.selected_steps or [] + + self.ecosystem_config = None + + @classmethod + def get_initial_steps(cls): + """Add the ``download_inputs`` step as an initial step if enabled.""" + steps = [] + if cls.download_inputs: + steps.append(cls.download_missing_inputs) + steps.append(cls.archive_downloads) + return tuple(steps) + + @classmethod + def get_info(cls, as_html=False): + """Add the option to render the values as HTML.""" + info = super().get_info() + + if as_html: + info["summary"] = convert_markdown_to_html(info["summary"]) + info["description"] = convert_markdown_to_html(info["description"]) + for step in info["steps"]: + step["doc"] = convert_markdown_to_html(step["doc"]) + + return info + + def append_to_log(self, message): + self.run.append_to_log(message) + + def set_current_step(self, message): + self.run.set_current_step(message) + + def add_error(self, exception, resource=None): + """Create a ``ProjectMessage`` ERROR record on the current `project`.""" + self.project.add_error( + model=self.pipeline_name, + exception=exception, + object_instance=resource, + ) + + @contextmanager + def save_errors(self, *exceptions, **kwargs): + """ + Context manager to save specified exceptions as ``ProjectMessage`` in the + database. + + - Example in a Pipeline step:: + + with self.save_errors(rootfs.DistroNotFound): + rootfs.scan_rootfs_for_system_packages(self.project, rfs) + + - Example when iterating over resources:: + + for resource in self.project.codebaseresources.all(): + with self.save_errors(Exception, resource=resource): + analyse(resource) + """ + try: + yield + except exceptions as error: + self.add_error(exception=error, **kwargs) + + +class Pipeline(ProjectPipeline): + """Alias for the ProjectPipeline class.""" + + pass + + +def is_pipeline(obj): + """ + Return True if the `obj` is a subclass of `Pipeline` except for the + `Pipeline` class itself. + """ + return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline + + +def profile(step): + """ + Profile a Pipeline step and save the results as HTML file in the project output + directory. + + Usage: + @profile + def step(self): + pass + """ + + @wraps(step) + def wrapper(*arg, **kwargs): + pipeline_instance = arg[0] + project = pipeline_instance.project + + with Profiler() as profiler: + result = step(*arg, **kwargs) + + output_file = project.get_output_file_path("profile", "html") + output_file.write_text(profiler.output_html()) + + pipeline_instance.log(f"Profiling results at {output_file.resolve()}") + + return result + + return wrapper diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 81ae91c21d..906a2ee3a1 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -1,347 +1,345 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. 
-# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import logging -import os -import shutil -from datetime import datetime -from pathlib import Path - -from django.core.exceptions import FieldDoesNotExist -from django.core.validators import EMPTY_VALUES -from django.db import models - -import openpyxl -import requests -from typecode.contenttype import get_type - -from scanpipe import pipes -from scanpipe.models import CodebaseRelation -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredDependency -from scanpipe.models import DiscoveredLicense -from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource -from scanpipe.pipes import scancode -from scanpipe.pipes.output import mappings_key_by_fieldname -from scancodeio.settings import download_store - -logger = logging.getLogger(__name__) - - -def copy_input(input_location, dest_path): - """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" - input_path = Path(input_location) - destination_dir = Path(dest_path) - destination = destination_dir / input_path.name - - if input_path.is_dir(): - shutil.copytree(input_location, destination) - else: - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - shutil.copyfile(input_location, destination) - - return destination - - -def copy_inputs(input_locations, dest_path): - """Copy the provided ``input_locations`` to the ``dest_path``.""" - for input_location in input_locations: - copy_input(input_location, dest_path) - - -def move_input(input_location, dest_path): - """Move the provided ``input_location`` to the ``dest_path``.""" - destination = dest_path / Path(input_location).name - return shutil.move(input_location, destination) - - -def move_inputs(inputs, dest_path): - """Move the provided ``inputs`` to the ``dest_path``.""" - for input_location in inputs: - move_input(input_location, dest_path) - - -def get_tool_name_from_scan_headers(scan_data): - """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - tool_name = first_header.get("tool_name", "") - return tool_name - - -def get_extra_data_from_scan_headers(scan_data): - """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - if extra_data := first_header.get("extra_data"): - return extra_data - - -def is_archive(location): - """Return True if the file at ``location`` is an archive.""" - return get_type(location).is_archive - - -def 
load_inventory_from_toolkit_scan(project, input_location): - """ - Create license detections, packages, dependencies, and resources - loaded from the ScanCode-toolkit scan results located at ``input_location``. - """ - scanned_codebase = scancode.get_virtual_codebase(project, input_location) - scancode.create_discovered_licenses(project, scanned_codebase) - scancode.create_discovered_packages(project, scanned_codebase) - scancode.create_codebase_resources(project, scanned_codebase) - scancode.create_discovered_dependencies( - project, scanned_codebase, strip_datafile_path_root=True - ) - scancode.load_todo_issues(project, scanned_codebase) - - -def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): - """ - Create packages, dependencies, license detections, resources, and relations - loaded from a ScanCode.io JSON output provided as ``scan_data``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - for detection_data in scan_data.get("license_detections", []): - pipes.update_or_create_license_detection(project, detection_data) - - for package_data in scan_data.get("packages", []): - pipes.update_or_create_package(project, package_data) - - for resource_data in scan_data.get("files", []): - pipes.update_or_create_resource(project, resource_data) - - for dependency_data in scan_data.get("dependencies", []): - pipes.update_or_create_dependency(project, dependency_data) - - for relation_data in scan_data.get("relations", []): - pipes.get_or_create_relation(project, relation_data) - - if extra_data := get_extra_data_from_scan_headers(scan_data): - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -model_to_object_maker_func = { - DiscoveredPackage: pipes.update_or_create_package, - DiscoveredDependency: pipes.update_or_create_dependency, - DiscoveredLicense: pipes.update_or_create_license_detection, - CodebaseResource: pipes.update_or_create_resource, - CodebaseRelation: pipes.get_or_create_relation, -} - -worksheet_name_to_model = { - "PACKAGES": DiscoveredPackage, - "LICENSE_DETECTIONS": DiscoveredLicense, - "RESOURCES": CodebaseResource, - "DEPENDENCIES": DiscoveredDependency, - "RELATIONS": CodebaseRelation, -} - - -def get_worksheet_data(worksheet): - """Return the data from provided ``worksheet`` as a list of dict.""" - try: - header = [cell.value for cell in next(worksheet.rows)] - except StopIteration: - return {} - - worksheet_data = [ - dict(zip(header, row)) - for row in worksheet.iter_rows(min_row=2, values_only=True) - ] - return worksheet_data - - -def clean_xlsx_field_value(model_class, field_name, value): - """Clean the ``value`` for compatibility with the database ``model_class``.""" - if value in EMPTY_VALUES: - return - - if field_name == "for_packages": - return value.splitlines() - - elif field_name in ["purl", "for_package_uid", "datafile_path"]: - return value - - try: - field = model_class._meta.get_field(field_name) - except FieldDoesNotExist: - return - - if dict_key := mappings_key_by_fieldname.get(field_name): - return [{dict_key: entry} for entry in value.splitlines()] - - elif isinstance(field, models.JSONField): - if field.default is list: - return value.splitlines() - elif field.default is dict: - return # dict stored as JSON are not supported - - return value - - -def clean_xlsx_data_to_model_data(model_class, xlsx_data): - """Clean the ``xlsx_data`` for 
compatibility with the database ``model_class``.""" - cleaned_data = {} - - for field_name, value in xlsx_data.items(): - if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): - cleaned_data[field_name] = cleaned_value - - return cleaned_data - - -def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): - """ - Create packages, dependencies, resources, and relations loaded from XLSX file - located at ``input_location``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) - - for worksheet_name, model_class in worksheet_name_to_model.items(): - if worksheet_name not in workbook: - continue - - worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) - for row_data in worksheet_data: - object_maker_func = model_to_object_maker_func.get(model_class) - cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) - if cleaned_data: - object_maker_func(project, cleaned_data) - - if "LAYERS" in workbook: - layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) - extra_data = {"layers": layers_data} - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -def add_input_from_url(project, url, filename=None): - """ - Download the file from the provided ``url`` and add it as an InputSource for the - specified ``project``. Optionally, specify a ``filename`` for the downloaded file. - If archiving is enabled, store the content in the DownloadStore and save metadata. - """ - try: - response = requests.get(url, stream=True,timeout=30) - response.raise_for_status() - content = response.content - except requests.RequestException as e: - logger.error(f"Failed to download {url}: {e}") - raise - - filename = filename or url.split("/")[-1] or "downloaded_file" - url_hash = hashlib.sha256(url.encode()).hexdigest() - archive_path = Path(project.settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if download_store: - try: - download = download_store.put( - content=content, - download_url=url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to archive download for {url}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - download_url=url, - file_path=str(input_path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise - -def add_input_from_upload(project, uploaded_file): - """ - Add an uploaded file as an InputSource for the specified ``project``. - If archiving is enabled, store the content in the DownloadStore and save metadata. 
- """ - content = uploaded_file.read() - filename = uploaded_file.name - - if download_store: - try: - download = download_store.put( - content=content, - download_url="", - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to archive upload {filename}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - file_path=str(input_path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+ +import logging +import os +import shutil +from datetime import datetime +from pathlib import Path + +from django.core.exceptions import FieldDoesNotExist +from django.core.validators import EMPTY_VALUES +from django.db import models + +import openpyxl +import requests +from typecode.contenttype import get_type + +from scancodeio.settings import download_store +from scanpipe import pipes +from scanpipe.models import CodebaseRelation +from scanpipe.models import CodebaseResource +from scanpipe.models import DiscoveredDependency +from scanpipe.models import DiscoveredLicense +from scanpipe.models import DiscoveredPackage +from scanpipe.models import InputSource +from scanpipe.pipes import scancode +from scanpipe.pipes.output import mappings_key_by_fieldname + +logger = logging.getLogger(__name__) + + +def copy_input(input_location, dest_path): + """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" + input_path = Path(input_location) + destination_dir = Path(dest_path) + destination = destination_dir / input_path.name + + if input_path.is_dir(): + shutil.copytree(input_location, destination) + else: + if not os.path.exists(destination_dir): + os.makedirs(destination_dir) + shutil.copyfile(input_location, destination) + + return destination + + +def copy_inputs(input_locations, dest_path): + """Copy the provided ``input_locations`` to the ``dest_path``.""" + for input_location in input_locations: + copy_input(input_location, dest_path) + + +def move_input(input_location, dest_path): + """Move the provided ``input_location`` to the ``dest_path``.""" + destination = dest_path / Path(input_location).name + return shutil.move(input_location, destination) + + +def move_inputs(inputs, dest_path): + """Move the provided ``inputs`` to the ``dest_path``.""" + for input_location in inputs: + move_input(input_location, dest_path) + + +def get_tool_name_from_scan_headers(scan_data): + """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" + if headers := scan_data.get("headers", []): + first_header = headers[0] + tool_name = first_header.get("tool_name", "") + return tool_name + + +def get_extra_data_from_scan_headers(scan_data): + """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" + if headers := scan_data.get("headers", []): + first_header = headers[0] + if extra_data := first_header.get("extra_data"): + return extra_data + + +def is_archive(location): + """Return True if the file at ``location`` is an archive.""" + return get_type(location).is_archive + + +def load_inventory_from_toolkit_scan(project, input_location): + """ + Create license detections, packages, dependencies, and resources + loaded from the ScanCode-toolkit scan results located at ``input_location``. + """ + scanned_codebase = scancode.get_virtual_codebase(project, input_location) + scancode.create_discovered_licenses(project, scanned_codebase) + scancode.create_discovered_packages(project, scanned_codebase) + scancode.create_codebase_resources(project, scanned_codebase) + scancode.create_discovered_dependencies( + project, scanned_codebase, strip_datafile_path_root=True + ) + scancode.load_todo_issues(project, scanned_codebase) + + +def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): + """ + Create packages, dependencies, license detections, resources, and relations + loaded from a ScanCode.io JSON output provided as ``scan_data``. 
+ + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. + """ + for detection_data in scan_data.get("license_detections", []): + pipes.update_or_create_license_detection(project, detection_data) + + for package_data in scan_data.get("packages", []): + pipes.update_or_create_package(project, package_data) + + for resource_data in scan_data.get("files", []): + pipes.update_or_create_resource(project, resource_data) + + for dependency_data in scan_data.get("dependencies", []): + pipes.update_or_create_dependency(project, dependency_data) + + for relation_data in scan_data.get("relations", []): + pipes.get_or_create_relation(project, relation_data) + + if extra_data := get_extra_data_from_scan_headers(scan_data): + if extra_data_prefix: + extra_data = {extra_data_prefix: extra_data} + project.update_extra_data(extra_data) + + +model_to_object_maker_func = { + DiscoveredPackage: pipes.update_or_create_package, + DiscoveredDependency: pipes.update_or_create_dependency, + DiscoveredLicense: pipes.update_or_create_license_detection, + CodebaseResource: pipes.update_or_create_resource, + CodebaseRelation: pipes.get_or_create_relation, +} + +worksheet_name_to_model = { + "PACKAGES": DiscoveredPackage, + "LICENSE_DETECTIONS": DiscoveredLicense, + "RESOURCES": CodebaseResource, + "DEPENDENCIES": DiscoveredDependency, + "RELATIONS": CodebaseRelation, +} + + +def get_worksheet_data(worksheet): + """Return the data from provided ``worksheet`` as a list of dict.""" + try: + header = [cell.value for cell in next(worksheet.rows)] + except StopIteration: + return {} + + worksheet_data = [ + dict(zip(header, row)) + for row in worksheet.iter_rows(min_row=2, values_only=True) + ] + return worksheet_data + + +def clean_xlsx_field_value(model_class, field_name, value): + """Clean the ``value`` for compatibility with the database ``model_class``.""" + if value in EMPTY_VALUES: + return + + if field_name == "for_packages": + return value.splitlines() + + elif field_name in ["purl", "for_package_uid", "datafile_path"]: + return value + + try: + field = model_class._meta.get_field(field_name) + except FieldDoesNotExist: + return + + if dict_key := mappings_key_by_fieldname.get(field_name): + return [{dict_key: entry} for entry in value.splitlines()] + + elif isinstance(field, models.JSONField): + if field.default is list: + return value.splitlines() + elif field.default is dict: + return # dict stored as JSON are not supported + + return value + + +def clean_xlsx_data_to_model_data(model_class, xlsx_data): + """Clean the ``xlsx_data`` for compatibility with the database ``model_class``.""" + cleaned_data = {} + + for field_name, value in xlsx_data.items(): + if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): + cleaned_data[field_name] = cleaned_value + + return cleaned_data + + +def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): + """ + Create packages, dependencies, resources, and relations loaded from XLSX file + located at ``input_location``. + + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. 
+ """ + workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) + + for worksheet_name, model_class in worksheet_name_to_model.items(): + if worksheet_name not in workbook: + continue + + worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) + for row_data in worksheet_data: + object_maker_func = model_to_object_maker_func.get(model_class) + cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) + if cleaned_data: + object_maker_func(project, cleaned_data) + + if "LAYERS" in workbook: + layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) + extra_data = {"layers": layers_data} + if extra_data_prefix: + extra_data = {extra_data_prefix: extra_data} + project.update_extra_data(extra_data) + + +def add_input_from_url(project, url, filename=None): + """ + Download the file from the provided ``url`` and add it as an InputSource for the + specified ``project``. Optionally, specify a ``filename`` for the downloaded file. + If archiving is enabled, store the content in the DownloadStore and save metadata. + """ + try: + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + content = response.content + except requests.RequestException as e: + logger.error(f"Failed to download {url}: {e}") + raise + + filename = filename or url.split("/")[-1] or "downloaded_file" + + if download_store: + try: + download = download_store.put( + content=content, + download_url=url, + download_date=datetime.now().isoformat(), + filename=filename, + ) + InputSource.objects.create( + project=project, + sha256=download.sha256, + download_url=download.download_url, + filename=download.filename, + download_date=download.download_date, + file_path=str(download.path), + is_uploaded=False, + ) + except Exception as e: + logger.error(f"Failed to archive download for {url}: {e}") + raise + else: + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + download_url=url, + file_path=str(input_path), + is_uploaded=False, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise + + +def add_input_from_upload(project, uploaded_file): + """ + Add an uploaded file as an InputSource for the specified ``project``. + If archiving is enabled, store the content in the DownloadStore and save metadata. 
+ """ + content = uploaded_file.read() + filename = uploaded_file.name + + if download_store: + try: + download = download_store.put( + content=content, + download_url="", + download_date=datetime.now().isoformat(), + filename=filename, + ) + InputSource.objects.create( + project=project, + sha256=download.sha256, + download_url=download.download_url, + filename=download.filename, + download_date=download.download_date, + file_path=str(download.path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to archive upload {filename}: {e}") + raise + else: + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + file_path=str(input_path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise diff --git a/scanpipe/tests/test_archiving.py b/scanpipe/tests/test_archiving.py index a249c96c46..0da1a236b5 100644 --- a/scanpipe/tests/test_archiving.py +++ b/scanpipe/tests/test_archiving.py @@ -1,86 +1,86 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
- - -import hashlib -from pathlib import Path - -from django.test import TestCase - -from scanpipe.archiving import LocalFilesystemProvider -from scanpipe.tests import make_project - - -class TestArchiving(TestCase): - def setUp(self): - self.project = make_project() - self.root_path = Path(__file__).parent / "data" / "test_downloads" - self.store = LocalFilesystemProvider(root_path=self.root_path) - self.test_content = b"test content" - self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - self.test_filename = "sample.tar.gz" - - def tearDown(self): - if self.root_path.exists(): - import shutil - - shutil.rmtree(self.root_path) - - def test_local_filesystem_provider_put_get(self): - download = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - sha256 = hashlib.sha256(self.test_content).hexdigest() - self.assertEqual(download.sha256, sha256) - self.assertEqual(download.download_url, self.test_url) - self.assertEqual(download.filename, self.test_filename) - self.assertEqual(download.download_date, "2025-08-21T09:00:00") - content_path = ( - self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" - ) - self.assertTrue(content_path.exists()) - with open(content_path, "rb") as f: - self.assertEqual(f.read(), self.test_content) - - retrieved = self.store.get(sha256) - self.assertEqual(retrieved.sha256, sha256) - self.assertEqual(retrieved.download_url, self.test_url) - self.assertEqual(retrieved.filename, self.test_filename) - - def test_local_filesystem_provider_deduplication(self): - download1 = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - download2 = self.store.put( - content=self.test_content, - download_url="https://files.pythonhosted.org/packages/another.tar.gz", - download_date="2025-08-21T10:00:00", - filename="another.tar.gz", - ) - self.assertEqual(download1.sha256, download2.sha256) - self.assertEqual(download1.download_url, self.test_url) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
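+
+# These tests exercise the LocalFilesystemProvider content-addressed layout:
+# archived content lives at <root>/<sha256[:2]>/<sha256[2:4]>/<sha256[4:]>/content,
+# its metadata (download_url, filename, download_date) is retrievable with
+# get(sha256), and identical content stored twice deduplicates to one entry.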
+ + +import hashlib +from pathlib import Path + +from django.test import TestCase + +from scanpipe.archiving import LocalFilesystemProvider +from scanpipe.tests import make_project + + +class TestArchiving(TestCase): + def setUp(self): + self.project = make_project() + self.root_path = Path(__file__).parent / "data" / "test_downloads" + self.store = LocalFilesystemProvider(root_path=self.root_path) + self.test_content = b"test content" + self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" + self.test_filename = "sample.tar.gz" + + def tearDown(self): + if self.root_path.exists(): + import shutil + + shutil.rmtree(self.root_path) + + def test_local_filesystem_provider_put_get(self): + download = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + sha256 = hashlib.sha256(self.test_content).hexdigest() + self.assertEqual(download.sha256, sha256) + self.assertEqual(download.download_url, self.test_url) + self.assertEqual(download.filename, self.test_filename) + self.assertEqual(download.download_date, "2025-08-21T09:00:00") + content_path = ( + self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" + ) + self.assertTrue(content_path.exists()) + with open(content_path, "rb") as f: + self.assertEqual(f.read(), self.test_content) + + retrieved = self.store.get(sha256) + self.assertEqual(retrieved.sha256, sha256) + self.assertEqual(retrieved.download_url, self.test_url) + self.assertEqual(retrieved.filename, self.test_filename) + + def test_local_filesystem_provider_deduplication(self): + download1 = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + download2 = self.store.put( + content=self.test_content, + download_url="https://files.pythonhosted.org/packages/another.tar.gz", + download_date="2025-08-21T10:00:00", + filename="another.tar.gz", + ) + self.assertEqual(download1.sha256, download2.sha256) + self.assertEqual(download1.download_url, self.test_url) diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index 3f2848cf1b..e55a90cace 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -1,143 +1,112 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: -# http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, -# software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an -# "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. 
-# Visit https://github.com/aboutcode-org/scancode.io for support and download. - - -from pathlib import Path -from unittest.mock import patch - -from django.core.files.uploadedfile import SimpleUploadedFile -from django.test import TestCase - -from scanpipe.models import InputSource -from scanpipe.pipes.input import add_input_from_upload -from scanpipe.pipes.input import add_input_from_url -from scancodeio.settings import settings -from scanpipe.tests import make_project - - -class TestInput(TestCase): - def setUp(self): - self.project = make_project() - self.test_filename = "sample.tar.gz" - self.test_data_path = ( - Path(__file__).parent / - "data" / - "test-downloads" / - self.test_filename - ) - with open(self.test_data_path, "rb") as f: - self.test_content = f.read() - - @patch("requests.get") - def test_add_input_from_url(self, mock_get): - test_url = ( - "https://files.pythonhosted.org/" - "packages/sample.tar.gz" - ) - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url( - self.project, - test_url, - filename=self.test_filename - ) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith( - settings.CENTRAL_ARCHIVE_PATH - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - @patch("requests.get") - def test_add_input_from_url_fallback(self, mock_get): - test_url = ( - "https://files.pythonhosted.org/" - "packages/sample.tar.gz" - ) - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url( - self.project, - test_url, - filename=self.test_filename - ) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith( - str(self.project.input_path) - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - def test_add_input_from_upload(self): - uploaded_file = SimpleUploadedFile( - self.test_filename, - self.test_content - ) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertTrue(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith( - settings.CENTRAL_ARCHIVE_PATH - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - def test_add_input_from_upload_fallback(self): - uploaded_file = SimpleUploadedFile( - self.test_filename, - self.test_content - ) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertFalse(input_source.sha256) - 
self.assertFalse(input_source.download_date)
-        self.assertTrue(input_source.is_uploaded)
-        self.assertTrue(
-            str(input_source.file_path).startswith(
-                str(self.project.input_path)
-            )
-        )
-        self.assertTrue(Path(input_source.file_path).exists())
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at:
+# http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing,
+# software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an
+# "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+
+from pathlib import Path
+from unittest.mock import patch
+
+from django.conf import settings
+from django.core.files.uploadedfile import SimpleUploadedFile
+from django.test import TestCase
+
+from scanpipe.models import InputSource
+from scanpipe.pipes.input import add_input_from_upload
+from scanpipe.pipes.input import add_input_from_url
+from scanpipe.tests import make_project
+
+
+class TestInput(TestCase):
+    def setUp(self):
+        self.project = make_project()
+        self.test_filename = "sample.tar.gz"
+        self.test_data_path = (
+            Path(__file__).parent / "data" / "test-downloads" / self.test_filename
+        )
+        with open(self.test_data_path, "rb") as f:
+            self.test_content = f.read()
+
+    @patch("requests.get")
+    def test_add_input_from_url(self, mock_get):
+        test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        mock_get.return_value.content = self.test_content
+        mock_get.return_value.status_code = 200
+        add_input_from_url(self.project, test_url, filename=self.test_filename)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertFalse(input_source.is_uploaded)
+        self.assertTrue(
+            input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
+
+    @patch("scanpipe.pipes.input.download_store", None)
+    @patch("requests.get")
+    def test_add_input_from_url_fallback(self, mock_get):
+        test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        mock_get.return_value.content = self.test_content
+        mock_get.return_value.status_code = 200
+        add_input_from_url(self.project, test_url, filename=self.test_filename)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertFalse(input_source.sha256)
+        
self.assertFalse(input_source.download_date) + self.assertFalse(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith(str(self.project.input_path)) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + def test_add_input_from_upload(self): + uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertTrue(input_source.sha256) + self.assertTrue(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + @patch("scanpipe.pipes.input.download_store", None) + def test_add_input_from_upload_fallback(self): + uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertFalse(input_source.sha256) + self.assertFalse(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith(str(self.project.input_path)) + ) + self.assertTrue(Path(input_source.file_path).exists()) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 306ea85e17..3dc8c61bea 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1,2054 +1,2054 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. 
- -import io -import json -import os -import sys -import tempfile -from contextlib import redirect_stderr -from pathlib import Path -from unittest import mock -from unittest import skipIf -from unittest.mock import patch - -from django.conf import settings -from django.test import TestCase -from django.test import tag - -from packageurl import PackageURL -from scancode.cli_test_utils import purl_with_fake_uuid -from scorecode.models import PackageScore - -from scanpipe import pipes -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource -from scanpipe.pipelines import CommonStepsMixin -from scanpipe.pipelines import InputFilesError -from scanpipe.pipelines import Pipeline -from scanpipe.pipelines import analyze_root_filesystem -from scanpipe.pipelines import deploy_to_develop -from scanpipe.pipelines import is_pipeline -from scanpipe.pipelines import scan_single_package -from scanpipe.pipes import d2d -from scanpipe.pipes import flag -from scanpipe.pipes import output -from scanpipe.pipes import scancode -from scanpipe.pipes.input import copy_input -from scanpipe.tests import FIXTURES_REGEN -from scanpipe.tests import make_mock_response -from scanpipe.tests import make_package -from scanpipe.tests import make_project -from scanpipe.tests import package_data1 -from scanpipe.tests.pipelines.do_nothing import DoNothing -from scanpipe.tests.pipelines.download_inputs import DownloadInput -from scanpipe.tests.pipelines.profile_step import ProfileStep -from scanpipe.tests.pipelines.steps_as_attribute import StepsAsAttribute -from scanpipe.tests.pipelines.with_groups import WithGroups - -from_docker_image = os.environ.get("FROM_DOCKER_IMAGE") - - -class ScanPipePipelinesTest(TestCase): - data = Path(__file__).parent / "data" - - def test_scanpipe_pipeline_class_pipeline_name_attribute(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline_instance = DoNothing(run) - self.assertEqual("do_nothing", pipeline_instance.pipeline_name) - - def test_scanpipe_pipeline_class_get_info(self): - expected = { - "description": "Description section of the doc string.", - "summary": "Do nothing, in 2 steps.", - "steps": [ - {"name": "step1", "doc": "Step1 doc.", "groups": []}, - {"name": "step2", "doc": "Step2 doc.", "groups": []}, - ], - "available_groups": [], - } - self.assertEqual(expected, DoNothing.get_info()) - - expected = { - "summary": "Profile a step using the @profile decorator.", - "description": "", - "steps": [ - {"name": "step", "doc": "", "groups": []}, - ], - "available_groups": [], - } - self.assertEqual(expected, ProfileStep.get_info()) - - def test_scanpipe_pipeline_class_get_summary(self): - expected = "Do nothing, in 2 steps." - self.assertEqual(expected, DoNothing.get_summary()) - - expected = "Profile a step using the @profile decorator." 
- self.assertEqual(expected, ProfileStep.get_summary()) - - def test_scanpipe_pipeline_class_log(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - pipeline.log("Event1") - pipeline.log("Event2") - - run.refresh_from_db() - self.assertIn("Event1", run.log) - self.assertIn("Event2", run.log) - - def test_scanpipe_pipeline_class_execute(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode) - self.assertEqual("", out) - - run.refresh_from_db() - self.assertIn("Pipeline [do_nothing] starting", run.log) - self.assertIn("Step [step1] starting", run.log) - self.assertIn("Step [step1] completed", run.log) - self.assertIn("Step [step2] starting", run.log) - self.assertIn("Step [step2] completed", run.log) - self.assertIn("Pipeline completed", run.log) - - def test_scanpipe_pipeline_class_execute_with_exception(self): - project1 = make_project() - run = project1.add_pipeline("raise_exception") - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode) - self.assertTrue(out.startswith("Error message")) - self.assertIn("Traceback:", out) - self.assertIn("in execute", out) - self.assertIn("step(self)", out) - self.assertIn("in raise_exception", out) - self.assertIn("raise ValueError", out) - - run.refresh_from_db() - self.assertIn("Pipeline [raise_exception] starting", run.log) - self.assertIn("Step [raise_exception_step] starting", run.log) - self.assertIn("Pipeline failed", run.log) - - @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step1") - @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step2") - def test_scanpipe_pipeline_class_execute_with_selected_steps(self, step2, step1): - step1.__name__ = "step1" - step1.groups = [] - step2.__name__ = "step2" - step2.groups = [] - - project1 = make_project() - run = project1.add_pipeline("do_nothing") - run.update(selected_steps=["step2", "not_existing_step"]) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode) - self.assertEqual("", out) - - step1.assert_not_called() - step2.assert_called() - - run.refresh_from_db() - self.assertIn("Pipeline [do_nothing] starting", run.log) - self.assertIn("Step [step1] skipped", run.log) - self.assertIn("Step [step2] starting", run.log) - self.assertIn("Step [step2] completed", run.log) - self.assertIn("Pipeline completed", run.log) - - def test_scanpipe_pipeline_class_download_inputs_attribute(self): - project1 = make_project() - run = project1.add_pipeline("download_inputs") - pipeline = run.make_pipeline_instance() - self.assertTrue(pipeline.download_inputs) - expected = (CommonStepsMixin.download_missing_inputs,) - self.assertEqual(expected, pipeline.get_initial_steps()) - expected = (CommonStepsMixin.download_missing_inputs, DownloadInput.step1) - self.assertEqual(expected, pipeline.get_steps()) - pipeline.execute() - self.assertIn("Step [download_missing_inputs]", run.log) - - run = project1.add_pipeline("profile_step") - pipeline = run.make_pipeline_instance() - self.assertFalse(pipeline.download_inputs) - pipeline.execute() - self.assertNotIn("Step [download_missing_inputs]", run.log) - - @mock.patch("requests.sessions.Session.get") - def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get): - project1 = make_project() - run = 
project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - file_location = self.data / "aboutcode" / "notice.NOTICE" - input_source = project1.add_input_source( - filename=file_location.name, is_uploaded=True - ) - self.assertFalse(input_source.exists()) - with self.assertRaises(InputFilesError) as error: - pipeline.download_missing_inputs() - error_msg = ( - "InputFilesError encountered with the following issues:\n\n" - "Error 1: Uploaded file filename=notice.NOTICE [uploaded] not available." - "\n\nNo traceback available." - ) - self.assertEqual(error_msg, str(error.exception)) - self.assertIn( - "Uploaded file filename=notice.NOTICE [uploaded] not available.", run.log - ) - - project1.copy_input_from(file_location) - self.assertTrue(input_source.exists()) - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - pipeline.download_missing_inputs() - self.assertEqual("", run.log) - - download_url = "https://download.url/file.zip" - mock_get.return_value = make_mock_response(url=download_url) - input_source2 = project1.add_input_source(download_url=download_url) - pipeline.download_missing_inputs() - self.assertIn("Fetching input from https://download.url/file.zip", run.log) - input_source2.refresh_from_db() - self.assertEqual("file.zip", input_source2.filename) - self.assertTrue(input_source2.exists()) - mock_get.assert_called_once() - - @mock.patch("scanpipe.models.InputSource.fetch") - def test_scanpipe_pipeline_class_download_fetch_exception(self, mock_fetch): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - mock_fetch.side_effect = Exception("File not found") - download_url = "https://download.url/file.zip" - project1.add_input_source(download_url=download_url) - - with self.assertRaises(InputFilesError) as error: - pipeline.download_missing_inputs() - self.assertIn( - "InputFilesError encountered with the following issues:", - str(error.exception), - ) - self.assertIn("Error 1: File not found", str(error.exception)) - self.assertIn("Traceback (most recent call last):", str(error.exception)) - self.assertIn("Exception: File not found", str(error.exception)) - - self.assertIn("Fetching input from https://download.url/file.zip", run.log) - self.assertIn("https://download.url/file.zip could not be fetched.", run.log) - - @mock.patch("git.repo.base.Repo.clone_from") - def test_scanpipe_pipeline_class_download_missing_inputs_git_repo(self, mock_clone): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - download_url = "https://github.com/aboutcode-org/scancode.io.git" - input_source = project1.add_input_source(download_url=download_url) - - def mock_make_to_path(**kwargs): - to_path = kwargs.get("to_path") - to_path.mkdir() - - mock_clone.side_effect = mock_make_to_path - mock_clone.return_value = None - - pipeline.download_missing_inputs() - self.assertIn( - "Fetching input from https://github.com/aboutcode-org/scancode.io.git", - run.log, - ) - input_source.refresh_from_db() - self.assertEqual("scancode.io.git", input_source.filename) - self.assertTrue(input_source.exists()) - - @mock.patch("requests.get") - def test_archive_downloads(self, mock_get): - project1 = make_project() - run = project1.add_pipeline("scan_codebase") - pipeline = run.make_pipeline_instance() - test_filename = "sample.tar.gz" - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - test_data_path = 
(
-            Path(__file__).parent / "data" / "test-downloads" / test_filename
-        )
-        with open(test_data_path, "rb") as f:
-            test_content = f.read()
-
-        input_source = InputSource.objects.create(
-            project=project1,
-            filename=test_filename,
-            download_url=test_url,
-            is_uploaded=False,
-        )
-
-        mock_get.return_value.content = test_content
-        mock_get.return_value.status_code = 200
-
-        pipeline.download_missing_inputs()
-        input_source.refresh_from_db()
-        self.assertTrue(
-            input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)
-        )
-        self.assertTrue(Path(input_source.file_path).exists())
-
-        pipeline.archive_downloads()
-        input_source.refresh_from_db()
-        self.assertTrue(input_source.sha256)
-        self.assertTrue(input_source.download_date)
-        self.assertEqual(input_source.download_url, test_url)
-        self.assertEqual(input_source.filename, test_filename)
-
-        # The same content downloaded into another project deduplicates to the
-        # same archived file path.
-        project2 = make_project(name="project2")
-        input_source2 = InputSource.objects.create(
-            project=project2,
-            filename=test_filename,
-            download_url=test_url,
-            is_uploaded=False,
-        )
-        run2 = project2.add_pipeline("scan_codebase")
-        pipeline2 = run2.make_pipeline_instance()
-        pipeline2.download_missing_inputs()
-        input_source2.refresh_from_db()
-        self.assertEqual(input_source.file_path, input_source2.file_path)
-        self.assertTrue(Path(input_source2.file_path).exists())
-
-    def test_scanpipe_pipeline_class_save_errors_context_manager(self):
-        project1 = make_project()
-        run = project1.add_pipeline("do_nothing")
-        pipeline = run.make_pipeline_instance()
-        self.assertEqual(project1, pipeline.project)
-
-        with pipeline.save_errors(Exception):
-            raise Exception("Error message")
-
-        message = project1.projectmessages.get()
-        self.assertEqual("do_nothing", message.model)
-        self.assertEqual({}, message.details)
-        self.assertEqual("Error message", message.description)
-        self.assertIn('raise Exception("Error message")', message.traceback)
-
-        resource1 = CodebaseResource.objects.create(project=project1, path="filename")
-        with pipeline.save_errors(Exception, resource=resource1):
-            raise Exception("Error message")
-        message = project1.projectmessages.latest("created_date")
-        self.assertEqual({"resource_path": str(resource1.path)}, message.details)
-
-    def test_scanpipe_pipelines_is_pipeline(self):
-        self.assertFalse(is_pipeline(None))
-        self.assertFalse(is_pipeline(Pipeline))
-        self.assertTrue(is_pipeline(DoNothing))
-
-        class SubSubClass(DoNothing):
-            pass
-
-        self.assertTrue(is_pipeline(SubSubClass))
-
-    def test_scanpipe_pipeline_class_get_graph(self):
-        expected = [
-            {"name": "step1", "doc": "Step1 doc.", "groups": []},
-            {"name": "step2", "doc": "Step2 doc.", "groups": []},
-        ]
-        self.assertEqual(expected, DoNothing.get_graph())
-
-    def test_scanpipe_pipelines_profile_decorator(self):
-        project1 = make_project()
-        run = project1.add_pipeline("profile_step")
-        pipeline_instance = run.make_pipeline_instance()
-
-        exitcode, out = pipeline_instance.execute()
-        self.assertEqual(0, exitcode)
-
-        run.refresh_from_db()
-        self.assertIn("Profiling results at", run.log)
-        self.assertIn("Pipeline completed", run.log)
-
-        self.assertEqual(1, len(project1.output_root))
-        output_file = project1.output_root[0]
-        self.assertTrue(output_file.startswith("profile-"))
-        self.assertTrue(output_file.endswith(".html"))
-
-    def test_scanpipe_pipeline_class_get_steps(self):
-        expected = (
-            DoNothing.step1,
-            DoNothing.step2,
-        )
-        self.assertEqual(expected, DoNothing.get_steps())
-
-        with self.assertRaises(TypeError) as cm:
-            StepsAsAttribute.get_steps()
-        expected = "Use a
``steps(cls)`` classmethod to declare the steps." - self.assertEqual(expected, str(cm.exception)) - - def test_scanpipe_pipeline_class_get_steps_with_groups(self): - expected = (WithGroups.no_groups,) - self.assertEqual(expected, WithGroups.get_steps()) - self.assertEqual(expected, WithGroups.get_steps(groups=[])) - self.assertEqual(expected, WithGroups.get_steps(groups=["not_defined"])) - - expected = ( - WithGroups.grouped_with_foo_and_bar, - WithGroups.grouped_with_bar, - WithGroups.no_groups, - ) - self.assertEqual(expected, WithGroups.get_steps(groups=["bar"])) - self.assertEqual(expected, WithGroups.get_steps(groups=["foo", "bar"])) - - expected = ( - WithGroups.grouped_with_foo_and_bar, - WithGroups.no_groups, - ) - self.assertEqual(expected, WithGroups.get_steps(groups=["foo"])) - - def test_scanpipe_pipeline_class_get_available_groups(self): - self.assertEqual(["bar", "excluded", "foo"], WithGroups.get_available_groups()) - self.assertEqual([], DoNothing.get_available_groups()) - - def test_scanpipe_pipeline_class_env_loaded_from_config_file(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - config_file = project1.input_path / settings.SCANCODEIO_CONFIG_FILE - config_file.write_text("{*this is not valid yml*}") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - config_file.write_text("product_name: Product") - pipeline = run.make_pipeline_instance() - self.assertEqual({"product_name": "Product"}, pipeline.env) - - def test_scanpipe_pipeline_class_env_reloaded_after_extraction(self): - project1 = make_project() - - input_location = self.data / "settings" / "archived-scancode-config.zip" - project1.copy_input_from(input_location) - run = project1.add_pipeline("scan_codebase") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - # Manually run steps, env is reload from the scancode-config.yml contained in - # the archive - pipeline.copy_inputs_to_codebase_directory() - pipeline.extract_archives() - - expected = { - "product_name": "My Product Name", - "product_version": "1.0", - "ignored_patterns": ["*.tmp", "tests/*"], - } - self.assertEqual(expected, pipeline.env) - - def test_scanpipe_pipeline_class_flag_ignored_resources(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - self.assertIsNone(pipeline.env.get("ignored_patterns")) - - project1.settings.update({"ignored_patterns": "*.ext"}) - project1.save() - pipeline = run.make_pipeline_instance() - - with mock.patch("scanpipe.pipes.flag.flag_ignored_patterns") as mock_flag: - mock_flag.return_value = None - pipeline.flag_ignored_resources() - - mock_flag.assert_called_once() - patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS] - self.assertEqual(mock_flag.mock_calls[0].kwargs["patterns"], patterns_args) - self.assertEqual(mock_flag.mock_calls[0].kwargs["codebaseresources"].count(), 0) - - def test_scanpipe_pipeline_class_extract_archive(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - target = tempfile.mkdtemp() - input_location = str(self.data / "scancode" / "corrupted.tar.gz") - pipeline.extract_archive(input_location, target) - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors.get() - self.assertEqual("error", 
project_error.severity) - self.assertIn("gzip decompression failed", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "corrupted.tar.gz"}, project_error.details) - self.assertEqual("", project_error.traceback) - - def test_scanpipe_pipeline_class_extract_archives(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - input_location = str(self.data / "scancode" / "corrupted.tar.gz") - resource_location = copy_input(input_location, project1.codebase_path) - pipeline.extract_archives() - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors.get() - self.assertEqual("error", project_error.severity) - self.assertIn("gzip decompression failed", project_error.description) - self.assertEqual("extract_archives", project_error.model) - self.assertEqual( - {"resource_path": str(resource_location)}, project_error.details - ) - self.assertEqual("", project_error.traceback) - - -class RootFSPipelineTest(TestCase): - def test_scanpipe_rootfs_pipeline_extract_input_files_errors(self): - project1 = make_project() - run = project1.add_pipeline("analyze_root_filesystem_or_vm_image") - pipeline_instance = analyze_root_filesystem.RootFS(run) - - # Create 2 files in the input/ directory to generate error twice - project1.move_input_from(tempfile.mkstemp()[1]) - project1.move_input_from(tempfile.mkstemp()[1]) - self.assertEqual(2, len(project1.input_files)) - - with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - pipeline_instance.extract_input_files_to_codebase_directory() - - projects_errors = project1.projectmessages.all() - self.assertEqual(2, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - -def sort_for_os_compatibility(scan_data): - """Sort the ``scan_data`` files and relations in place. 
Return ``scan_data``.""" - if files := scan_data.get("files"): - files.sort(key=lambda x: x["path"]) - - if relations := scan_data.get("relations"): - relations.sort(key=lambda x: x["to_resource"]) - - return scan_data - - -@tag("slow") -class PipelinesIntegrationTest(TestCase): - """Integration tests to ensure the proper output for each built-in Pipelines.""" - - # Un-comment the following to display full diffs: - # maxDiff = None - data = Path(__file__).parent / "data" - exclude_from_diff = [ - "start_timestamp", - "end_timestamp", - "date", - "duration", - "input", - "compliance_alert", - "policy", - "tool_version", - "other_tools", - "created_date", - "log", - "uuid", - "size", # directory sizes are OS dependant - "size_count", - "--json-pp", - "--processes", - "--verbose", - # system_environment differs between systems - "system_environment", - "file_type", - # mime type and is_script are inconsistent across systems - "mime_type", - "is_script", - "notes", - "settings", - "description", - "traceback", - ] - - def _without_keys(self, data, exclude_keys): - """Return the `data` excluding the provided `exclude_keys`.""" - if isinstance(data, list): - return [self._without_keys(entry, exclude_keys) for entry in data] - - if isinstance(data, dict): - return { - key: ( - self._without_keys(value, exclude_keys) - if type(value) in [list, dict] - else value - ) - for key, value in data.items() - if key not in exclude_keys - } - - return data - - def purl_fields_with_fake_uuid(self, value, key): - purl_fields = ["purl", "for_packages", "package_uid"] - purl_name = "fixed-name-for-testing-5642512d1758" - purl_namespace = "fixed-namespace-for-testing-5642512d1758" - - if key == "name": - return purl_name - elif key == "namespace": - return purl_namespace - elif key in purl_fields: - purl_old = PackageURL.from_string(value) - if purl_old.type != "local-files": - return purl_with_fake_uuid(value) - - purl = PackageURL( - name=purl_name, - namespace=purl_namespace, - type="local-files", - version=purl_old.version, - qualifiers=purl_old.qualifiers, - subpath=purl_old.subpath, - ) - return purl_with_fake_uuid(purl.to_string()) - - def _normalize_package_uids(self, data): - """ - Return the `data`, where any `package_uid` value has been normalized - with `purl_with_fake_uuid()` - """ - fields_with_package_uids = [ - "package_uid", - "dependency_uid", - "for_package_uid", - "resolved_to_package_uid", - ] - if isinstance(data, list): - return [self._normalize_package_uids(entry) for entry in data] - - if isinstance(data, dict): - is_local_files = False - if data.get("type") and data["type"] == "local-files": - is_local_files = True - normalized_data = {} - for key, value in data.items(): - if isinstance(value, list | dict): - value = self._normalize_package_uids(value) - if key in fields_with_package_uids and value: - value = purl_with_fake_uuid(value) - if key == "for_packages" and value: - value = sorted( - [ - self.purl_fields_with_fake_uuid(package_uid, key) - for package_uid in value - ] - ) - if ( - is_local_files - and key in ("name", "namespace", "purl", "package_uid") - and value - ): - value = self.purl_fields_with_fake_uuid(value, key) - normalized_data[key] = value - return normalized_data - - return data - - def _sort_dependencies(self, data): - """ - Sort dependencies by their "for_package_uid". - - After dependency resolution in some cases we have multiple - dependency requirements resolved to a same package, and they - are not sorted the same way every time. 
-        """
-        mappings = data.get("dependencies")
-        if mappings:
-            mappings_by_uid = {}
-            for mapping in mappings:
-                uid = mapping.get("for_package_uid") or ""
-                mappings_by_uid[uid] = mapping
-            data["dependencies"] = list(dict(sorted(mappings_by_uid.items())).values())
-        return data
-
-    def test_package_uids_normalized_in_pipeline_integration_tests(self):
-        self.maxDiff = 1000
-        data = {
-            "type": "local-files",
-            "package_uid": (
-                "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23"
-                "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24"
-            ),
-            "for_packages": [
-                (
-                    "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23"
-                    "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24"
-                )
-            ],
-        }
-        normalized_data = self._normalize_package_uids(data=data)
-        expected_data = {
-            "type": "local-files",
-            "package_uid": (
-                "pkg:local-files/fixed-namespace-for-testing-5642512d1758/"
-                "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758"
-            ),
-            "for_packages": [
-                (
-                    "pkg:local-files/fixed-namespace-for-testing-5642512d1758/"
-                    "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758"
-                )
-            ],
-        }
-        self.assertEqual(normalized_data, expected_data)
-
-    def assertPipelineResultEqual(
-        self, expected_file, result_file, sort_dependencies=False, regen=FIXTURES_REGEN
-    ):
-        """Set `regen` to True to regenerate the expected results."""
-        result_json = json.loads(Path(result_file).read_text())
-        result_json = self._normalize_package_uids(result_json)
-        result_data = self._without_keys(result_json, self.exclude_from_diff)
-        if sort_dependencies:
-            result_data = self._sort_dependencies(result_data)
-        result_data = sort_for_os_compatibility(result_data)
-
-        if regen:
-            expected_file.write_text(json.dumps(result_data, indent=2))
-
-        expected_json = json.loads(expected_file.read_text())
-        expected_json = self._normalize_package_uids(expected_json)
-        expected_data = self._without_keys(expected_json, self.exclude_from_diff)
-        if sort_dependencies:
-            expected_data = self._sort_dependencies(expected_data)
-        expected_data = sort_for_os_compatibility(expected_data)
-
-        self.assertEqual(expected_data, result_data)
-
-    @skipIf(from_docker_image, "Random failure in the Docker context.")
-    def test_scanpipe_scan_package_pipeline_integration(self):
-        pipeline_name = "scan_single_package"
-        project1 = make_project()
-
-        input_location = self.data / "scancode" / "is-npm-1.0.0.tgz"
-        project1.copy_input_from(input_location)
-
-        run = project1.add_pipeline(pipeline_name)
-        pipeline = run.make_pipeline_instance()
-
-        exitcode, out = pipeline.execute()
-        self.assertEqual(0, exitcode, msg=out)
-
-        self.assertEqual(4, project1.codebaseresources.count())
-        self.assertEqual(1, project1.discoveredpackages.count())
-        self.assertEqual(1, project1.discovereddependencies.count())
-
-        scancode_file = project1.get_latest_output(filename="scancode")
-        expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_package.json"
-        self.assertPipelineResultEqual(expected_file, scancode_file)
-
-        summary_file = project1.get_latest_output(filename="summary")
-        expected_file = (
-            self.data / "scancode" / "is-npm-1.0.0_scan_package_summary.json"
-        )
-        self.assertPipelineResultEqual(expected_file, summary_file)
-
-        # Ensure that we only have one instance of is-npm in `key_files_packages`
-        summary_data = json.loads(Path(summary_file).read_text())
-        key_files_packages = summary_data.get("key_files_packages", [])
-        self.assertEqual(1, len(key_files_packages))
-        key_file_package = 
key_files_packages[0] - key_file_package_purl = key_file_package.get("purl", "") - self.assertEqual("pkg:npm/is-npm@1.0.0", key_file_package_purl) - - @skipIf(from_docker_image, "Random failure in the Docker context.") - def test_scanpipe_scan_package_pipeline_integration_multiple_packages(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "scancode" / "multiple-is-npm-1.0.0.tar.gz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(9, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(2, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = ( - self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package.json" - ) - # Do not override the regen as this file is generated in regen_test_data - self.assertPipelineResultEqual(expected_file, scancode_file) - - summary_file = project1.get_latest_output(filename="summary") - expected_file = ( - self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package_summary.json" - ) - self.assertPipelineResultEqual(expected_file, summary_file) - - @mock.patch("scanpipe.pipelines.scan_single_package.is_archive") - def test_scanpipe_scan_package_single_extract_input_to_codebase_directory( - self, mock_is_archive - ): - project1 = make_project() - run = project1.add_pipeline("scan_single_package") - pipeline_instance = scan_single_package.ScanSinglePackage(run) - - project1.move_input_from(tempfile.mkstemp(suffix=".zip")[1]) - self.assertEqual(1, len(project1.input_files)) - - mock_is_archive.return_value = True - pipeline_instance.get_package_input() - with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - pipeline_instance.extract_input_to_codebase_directory() - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - def test_scanpipe_scan_package_single_file(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "manifests" / "openpdf-parent-1.3.11.pom.xml" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(10, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = ( - self.data / "manifests" / "openpdf-parent-1.3.11_scan_package.json" - ) - self.assertPipelineResultEqual(expected_file, scancode_file) - - @mock.patch("git.repo.base.Repo.clone_from") - def test_scanpipe_scan_package_single_package_git_repo(self, mock_clone): - pipeline_name = "scan_single_package" - project1 = make_project() - - download_url = 
"https://github.com/aboutcode-org/scancode.io.git" - project1.add_input_source(download_url=download_url) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - # Create the "fetched" git directory content - def mock_make_git_directory(**kwargs): - to_path = kwargs.get("to_path") # scancode.io.git - to_path.mkdir() - file_location = self.data / "aboutcode" / "notice.NOTICE" - copy_input(file_location, to_path) - - mock_clone.side_effect = mock_make_git_directory - mock_clone.return_value = None - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(2, project1.codebaseresources.count()) - self.assertEqual(0, project1.discoveredpackages.count()) - - def test_scanpipe_scan_codebase_pipeline_integration(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_scan_codebase_creates_top_level_paths(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] - - top_level_resources = project1.codebaseresources.filter(parent_path="") - top_level_paths = [resource.path for resource in top_level_resources] - - self.assertListEqual(top_level_paths, expected_top_level_paths) - - def test_scanpipe_scan_codebase_creates_parent_path_field(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] - expected_nested_paths = [ - "is-npm-1.0.0.tgz-extract/package/index.js", - "is-npm-1.0.0.tgz-extract/package/package.json", - "is-npm-1.0.0.tgz-extract/package/readme.md", - ] - - top_level_resources = project1.codebaseresources.filter(parent_path="") - top_level_paths = [resource.path for resource in top_level_resources] - - self.assertListEqual(top_level_paths, expected_top_level_paths) - - nested_resources = project1.codebaseresources.filter( - parent_path="is-npm-1.0.0.tgz-extract/package" - ) - nested_paths = [resource.path for resource in nested_resources] - - self.assertListEqual(nested_paths, expected_nested_paths) - - def test_scanpipe_inspect_packages_creates_packages_npm(self): - pipeline_name = 
"inspect_packages" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - package = project1.discoveredpackages.get() - dependency = project1.discovereddependencies.get() - - self.assertEqual(3, package.codebase_resources.count()) - self.assertEqual("pkg:npm/is-npm@1.0.0", dependency.for_package.purl) - self.assertEqual(package.datasource_ids, [dependency.datasource_id]) - self.assertEqual( - package.codebase_resources.get( - path="is-npm-1.0.0.tgz-extract/package/package.json" - ).path, - dependency.datafile_resource.path, - ) - - def test_scanpipe_inspect_packages_creates_packages_pypi(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "manifests" / "python-inspector-0.10.0.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(0, project1.discoveredpackages.count()) - self.assertEqual(26, project1.discovereddependencies.count()) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_inspect_packages_with_resolved_dependencies_npm(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_npm.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(4, project1.codebaseresources.count()) - self.assertEqual(7, project1.discoveredpackages.count()) - self.assertEqual(6, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data - / "dependencies" - / "resolved_dependencies_npm_inspect_packages.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_inspect_packages_with_resolved_dependencies_poetry(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_poetry.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(5, project1.codebaseresources.count()) - self.assertEqual(6, project1.discoveredpackages.count()) - self.assertEqual(10, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data - / "dependencies" - / "resolved_dependencies_poetry_inspect_packages.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not 
supported on macOS") - def test_scanpipe_resolved_dependencies_cocoapods(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = ( - self.data / "dependencies" / "resolved_dependencies_cocoapods.zip" - ) - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(25, project1.discoveredpackages.count()) - self.assertEqual(30, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "dependencies" / "resolved_dependencies_cocoapods.json" - ) - self.assertPipelineResultEqual( - expected_file, result_file, sort_dependencies=True - ) - - def test_scanpipe_resolved_dependencies_pip_inspect(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_pip.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(4, project1.discoveredpackages.count()) - self.assertEqual(17, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "dependencies" / "resolved_dependencies_pip.json" - self.assertPipelineResultEqual( - expected_file, - result_file, - ) - - def test_scanpipe_resolved_dependencies_nuget(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_nuget.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(34, project1.discoveredpackages.count()) - self.assertEqual(108, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "dependencies" / "resolved_dependencies_nuget.json" - self.assertPipelineResultEqual( - expected_file, - result_file, - sort_dependencies=True, - ) - - def test_scanpipe_scan_codebase_can_process_wheel(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "daglib-0.6.0-py3-none-any.whl" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(11, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(8, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "scancode" / "daglib-0.6.0-py3-none-any.whl_scan_codebase.json" - ) - self.assertPipelineResultEqual(expected_file, 
result_file) - - @skipIf(sys.platform != "linux", "Expected results are inconsistent across OS") - def test_scanpipe_docker_pipeline_alpine_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "alpine_3_15_4.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(510, project1.codebaseresources.count()) - self.assertEqual(14, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "docker" / "alpine_3_15_4_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_does_not_report_errors_for_broken_symlinks(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "minitag.tar" - input_location = self.data / "image-with-symlinks" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - with redirect_stderr(io.StringIO()): - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - project_messages = project1.projectmessages.all() - self.assertEqual(1, len(project_messages)) - self.assertEqual("Distro not found.", project_messages[0].description) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "image-with-symlinks" / (filename + "-expected-scan.json") - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform != "linux", "RPM related features only supported on Linux.") - def test_scanpipe_docker_pipeline_rpm_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "centos.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(29, project1.codebaseresources.count()) - self.assertEqual(101, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "docker" / "centos_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_debian_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "debian.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(16, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = 
self.data / "docker" / "debian_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_distroless_debian_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "gcr_io_distroless_base.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(2458, project1.codebaseresources.count()) - self.assertEqual(6, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "docker" / "gcr_io_distroless_base_scan_codebase.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_rootfs_pipeline_integration(self): - pipeline_name = "analyze_root_filesystem_or_vm_image" - project1 = make_project() - - input_location = self.data / "rootfs" / "basic-rootfs.tar.gz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(17, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "rootfs" / "basic-rootfs_root_filesystems.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_load_inventory_pipeline_integration(self): - pipeline_name = "load_inventory" - project1 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(18, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(4, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "asgiref" / "asgiref-3.3.0_load_inventory_expected.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - # Using the ScanCode.io JSON output as the input - project2 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_scanpipe_output.json" - project2.copy_input_from(input_location) - - run = project2.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(18, project2.codebaseresources.count()) - self.assertEqual(2, project2.discoveredpackages.count()) - self.assertEqual(4, project2.discovereddependencies.count()) - - @mock.patch("scanpipe.pipes.vulnerablecode.is_available") - @mock.patch("scanpipe.pipes.vulnerablecode.is_configured") - @mock.patch("scanpipe.pipes.vulnerablecode.bulk_search_by_purl") - def test_scanpipe_find_vulnerabilities_pipeline_integration( - self, mock_bulk_search_by_purl, mock_is_configured, mock_is_available - ): - pipeline_name = "find_vulnerabilities" - project1 = make_project() - 
package1 = DiscoveredPackage.create_from_data(project1, package_data1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_configured.return_value = False - mock_is_available.return_value = False - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode, msg=out) - self.assertIn("VulnerableCode is not configured.", out) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_configured.return_value = True - mock_is_available.return_value = True - vulnerability_data = [ - { - "purl": "pkg:deb/debian/adduser@3.118?arch=all", - "affected_by_vulnerabilities": [ - { - "vulnerability_id": "VCID-cah8-awtr-aaad", - "summary": "An issue was discovered.", - }, - ], - }, - { - "purl": "pkg:deb/debian/adduser@3.118?qualifiers=1", - "affected_by_vulnerabilities": [ - { - "vulnerability_id": "VCID-cah8-awtr-aaad", - "summary": "An issue was discovered.", - }, - ], - }, - ] - mock_bulk_search_by_purl.return_value = vulnerability_data - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - expected = vulnerability_data[0]["affected_by_vulnerabilities"] - self.assertEqual(expected, package1.affected_by_vulnerabilities) - - @mock.patch("scorecode.ossf_scorecard.is_available") - def test_scanpipe_fetch_scores_pipeline_integration(self, mock_is_available): - pipeline_name = "fetch_scores" - project1 = make_project() - package1 = DiscoveredPackage.create_from_data(project1, package_data1) - package1.vcs_url = "https://github.com/ossf/scorecard" - package1.save() - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_available.return_value = False - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode, msg=out) - self.assertIn("ScoreCode service is not available.", out) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_available.return_value = True - - package_score_data = { - "scoring_tool": "ossf_scorecard", - "scoring_tool_version": "v5.2.1", - "score": "9.7", - "scoring_tool_documentation_url": "https://github.com/[trunc...]", - "score_date": "2025-07-24T18:50:16Z", - } - with mock.patch("scorecode.ossf_scorecard.fetch_scorecard") as fetch: - fetch.return_value = PackageScore(**package_score_data) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - scorecard_entry = package1.scores.filter(scoring_tool="ossf-scorecard").first() - self.assertIsNotNone(scorecard_entry) - self.assertEqual("ossf-scorecard", scorecard_entry.scoring_tool) - self.assertEqual("v5.2.1", scorecard_entry.scoring_tool_version) - self.assertTrue(scorecard_entry.score) - - def test_scanpipe_resolve_dependencies_pipeline_integration(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp()[1]) - pipeline.execute() - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("get_packages_from_manifest", message.model) - expected = "No resources containing package data found in codebase." 
- self.assertIn(expected, message.description) - - def test_scanpipe_resolve_dependencies_pipeline_integration_empty_manifest(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) - pipeline.execute() - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("get_packages_from_manifest", message.model) - expected = "No packages could be resolved" - self.assertIn(expected, message.description) - - @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") - def test_scanpipe_resolve_dependencies_pipeline_integration_misc( - self, mock_resolve_dependencies - ): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - input_location = self.data / "manifests" / "requirements.txt" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(1, project1.discoveredpackages.count()) - - @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") - def test_scanpipe_resolve_dependencies_pipeline_pypi_integration( - self, mock_resolve_dependencies - ): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) - mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - exclude_fields = ["qualifiers", "release_date", "size"] - for field_name, value in package_data1.items(): - if value and field_name not in exclude_fields: - self.assertEqual(value, getattr(discoveredpackage, field_name)) - - def test_scanpipe_load_sbom_pipeline_aboutfile_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "manifests" / "Django-4.0.8-py3-none-any.whl.ABOUT" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - self.assertEqual("pypi", discoveredpackage.type) - self.assertEqual("django", discoveredpackage.name) - self.assertEqual("4.0.8", discoveredpackage.version) - self.assertEqual("bsd-new", discoveredpackage.declared_license_expression) - - def test_scanpipe_load_sbom_pipeline_spdx_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "manifests" / "toml.spdx.json" - project1.copy_input_from(input_location) - - run = 
project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - self.assertEqual("pypi", discoveredpackage.type) - self.assertEqual("toml", discoveredpackage.name) - self.assertEqual("0.10.2", discoveredpackage.version) - self.assertEqual("https://github.com/uiri/toml", discoveredpackage.homepage_url) - self.assertEqual("MIT", discoveredpackage.extracted_license_statement) - self.assertEqual("mit", discoveredpackage.declared_license_expression) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "cyclonedx" / "nested.cdx.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(3, project1.discoveredpackages.count()) - packages = project1.discoveredpackages.all() - expected_data = { - "pkg:pypi/toml@0.10.2?extension=tar.gz": { - "type": "pypi", - "name": "toml", - "version": "0.10.2", - "extracted_license_statement": "OFL-1.1\nApache-2.0", - "declared_license_expression": "ofl-1.1 OR apache-2.0", - "homepage_url": "https://cyclonedx.org/website", - "bug_tracking_url": "https://cyclonedx.org/issue-tracker", - "vcs_url": "https://cyclonedx.org/vcs", - "filename": "", - }, - "pkg:pypi/billiard@3.6.3.0": { - "type": "pypi", - "name": "billiard", - "version": "3.6.3.0", - "extracted_license_statement": "BSD-3-Clause", - "declared_license_expression": "bsd-new", - "homepage_url": "", - "bug_tracking_url": "", - "vcs_url": "", - "extra_data": "", - "filename": "", - }, - "pkg:pypi/fictional@9.10.2": { - "type": "pypi", - "name": "fictional", - "version": "9.10.2", - "extracted_license_statement": ( - "LGPL-3.0-or-later" - " AND " - "LicenseRef-scancode-openssl-exception-lgpl3.0plus" - ), - "declared_license_expression": ( - "lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus" - ), - "homepage_url": "https://home.page", - "bug_tracking_url": "", - "vcs_url": "", - "extra_data": "", - "filename": "package.zip", - }, - } - - for package in packages: - expected = expected_data.get(str(package)) - self.assertEqual(expected["type"], package.type) - self.assertEqual(expected["name"], package.name) - self.assertEqual(expected["version"], package.version) - self.assertEqual(expected["homepage_url"], package.homepage_url) - self.assertEqual( - expected["extracted_license_statement"], - package.extracted_license_statement, - ) - self.assertEqual( - expected["declared_license_expression"], - package.declared_license_expression, - ) - self.assertEqual(expected["filename"], package.filename) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_with_dependencies_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "cyclonedx" / "laravel-7.12.0" / "bom.1.4.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(62, project1.discoveredpackages.count()) - self.assertEqual(112, project1.discovereddependencies.count()) - dependency = project1.discovereddependencies.all()[0] - 
self.assertEqual("bom.1.4.json", str(dependency.datafile_resource)) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_with_vulnerabilities(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = ( - self.data / "cyclonedx" / "python-3.13.0-vulnerabilities.cdx.json" - ) - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - package = project1.discoveredpackages.get() - expected = [ - { - "vulnerability_id": "CVE-2005-2541", - "summary": "Tar 1.15.1 does not properly warn the user when...", - } - ] - self.assertEqual(expected, package.affected_by_vulnerabilities) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("uuid.uuid4") - def test_scanpipe_deploy_to_develop_pipeline_integration( - self, mock_uuid4, mock_request - ): - forced_uuid = "b74fe5df-e965-415e-ba65-f38421a0695d" - mock_uuid4.return_value = forced_uuid - mock_request.return_value = None - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis", uuid=forced_uuid) - selected_groups = ["Java"] - - jar_location = self.data / "d2d" / "jars" - project1.copy_input_from(jar_location / "from-flume-ng-node-1.9.0.zip") - project1.copy_input_from(jar_location / "to-flume-ng-node-1.9.0.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(57, project1.codebaseresources.count()) - self.assertEqual(18, project1.codebaserelations.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "d2d" / "flume-ng-node-d2d.json" - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_deploy_to_develop_pipeline_integration_elfs(self): - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis") - selected_groups = ["Elf"] - - elf_location = self.data / "d2d-elfs" - project1.copy_input_from(elf_location / "from-brotli-d2d.zip") - project1.copy_input_from(elf_location / "to-brotli-d2d.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(17, project1.codebaseresources.count()) - self.assertEqual(7, project1.codebaserelations.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "d2d-elfs" / "brotli-elf-d2d.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_deploy_to_develop_pipeline_extract_input_files_errors(self): - project1 = make_project() - run = project1.add_pipeline("map_deploy_to_develop") - pipeline_instance = deploy_to_develop.DeployToDevelop(run) - - # Create 2 files in the input/ directory to generate error twice - project1.move_input_from(tempfile.mkstemp(prefix="from-")[1]) - project1.move_input_from(tempfile.mkstemp(prefix="to-")[1]) - self.assertEqual(2, len(project1.input_files)) - - pipeline_instance.get_inputs() - with 
mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - inputs_with_codebase_path_destination = [ - (pipeline_instance.from_files, project1.codebase_path / d2d.FROM), - (pipeline_instance.to_files, project1.codebase_path / d2d.TO), - ] - - for input_files, codebase_path in inputs_with_codebase_path_destination: - for input_file_path in input_files: - pipeline_instance.extract_archive(input_file_path, codebase_path) - - projects_errors = project1.projectmessages.all() - self.assertEqual(2, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("uuid.uuid4") - def test_scanpipe_deploy_to_develop_pipeline_with_about_file( - self, mock_uuid4, mock_request - ): - forced_uuid = "90cb6382-431c-4187-be76-d4f1a2199a2f" - mock_uuid4.return_value = forced_uuid - mock_request.return_value = None - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis", uuid=forced_uuid) - selected_groups = ["Java"] - - data_dir = self.data / "d2d" / "about_files" - project1.copy_input_from(data_dir / "from-with-about-file.zip") - project1.copy_input_from(data_dir / "to-with-jar.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(44, project1.codebaseresources.count()) - self.assertEqual(31, project1.codebaserelations.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = data_dir / "expected.json" - self.assertPipelineResultEqual(expected_file, result_file) - - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("map_about_files", message.model) - expected = ( - "Resource paths listed at about_resource is not found in the to/ codebase" - ) - self.assertIn(expected, message.description) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("scanpipe.pipes.purldb.is_available") - def test_scanpipe_populate_purldb_pipeline_integration( - self, mock_is_available, mock_request_post - ): - pipeline_name1 = "load_inventory" - pipeline_name2 = "populate_purldb" - project1 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name1) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - def mock_request_post_return(url, data, headers, timeout): - payload = json.loads(data) - return { - "queued_packages_count": len(payload["packages"]), - "queued_packages": payload["packages"], - "unqueued_packages_count": 1, - "unqueued_packages": [], - "unsupported_packages_count": 1, - "unsupported_packages": [], - } - - mock_request_post.side_effect = mock_request_post_return - mock_is_available.return_value = True - - run = project1.add_pipeline(pipeline_name2) - 
pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertIn("Populating PurlDB with 2 PURLs from DiscoveredPackage", run.log) - self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log) - self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) - self.assertIn("Couldn't index 1 unsupported PURLs", run.log) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("scanpipe.pipes.purldb.is_available") - def test_scanpipe_populate_purldb_pipeline_integration_without_assembly( - self, mock_is_available, mock_request_post - ): - pipeline_name = "populate_purldb" - project1 = make_project() - - def mock_request_post_return(url, data, headers, timeout): - payload = json.loads(data) - return { - "queued_packages_count": len(payload["packages"]), - "queued_packages": payload["packages"], - "unqueued_packages_count": 1, - "unqueued_packages": [], - "unsupported_packages_count": 1, - "unsupported_packages": [], - } - - mock_request_post.side_effect = mock_request_post_return - mock_is_available.return_value = True - - package_json_location = self.data / "manifests" / "package.json" - copy_input(package_json_location, project1.codebase_path) - pipes.collect_and_create_codebase_resources(project1) - - scancode.scan_for_application_packages(project1, assemble=False) - scancode.process_package_data(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertIn("Populating PurlDB with 1 PURLs from DiscoveredPackage", run.log) - self.assertIn( - "Populating PurlDB with 6 unresolved PURLs from DiscoveredDependency", - run.log, - ) - self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) - self.assertIn("Couldn't index 1 unsupported PURLs", run.log) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_ctags_pipeline_integration(self): - pipeline_name = "collect_symbols_ctags" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "d2d-javascript" / "from" / "main.js" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data_symbols = main_file.extra_data.get("source_symbols") - expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] - self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) - - @skipIf(sys.platform != "linux", "Only supported on Linux") - def test_scanpipe_collect_strings_gettext_pipeline_integration(self): - pipeline_name = "collect_strings_gettext" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "d2d-javascript" / "from" / "main.js" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data_strings = 
main_file.extra_data.get("source_strings") - expected_extra_data_strings = [ - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=", # noqa - "Enter the desired length of your password:", - ] - self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_pygments_pipeline_integration(self): - pipeline_name = "collect_symbols_pygments" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "source-inspector" / "test3.cpp" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data = main_file.extra_data - - expected_extra_data = ( - self.data / "source-inspector" / "test3.cpp-pygments-expected.json" - ) - - with open(expected_extra_data) as f: - expected_extra_data = json.load(f) - - self.assertDictEqual(expected_extra_data, result_extra_data) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_tree_sitter_pipeline_integration(self): - pipeline_name = "collect_symbols_tree_sitter" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "source-inspector" / "test3.cpp" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data = main_file.extra_data - - expected_extra_data = ( - self.data / "source-inspector" / "test3.cpp-tree-sitter-expected.json" - ) - - with open(expected_extra_data) as f: - expected_extra_data = json.load(f) - - self.assertDictEqual(expected_extra_data, result_extra_data) - - @mock.patch("scanpipe.pipes.purldb.is_available") - @mock.patch("scanpipe.pipes.purldb.is_configured") - @mock.patch("scanpipe.pipes.purldb.collect_data_for_purl") - def test_scanpipe_enrich_with_purldb_pipeline_integration( - self, mock_collect_data, mock_is_configured, mock_is_available - ): - pipeline_name = "enrich_with_purldb" - project1 = make_project() - package1 = make_package(project1, package_url="pkg:npm/csvtojson@2.0.10") - - mock_is_configured.return_value = True - mock_is_available.return_value = True - - purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json" - purldb_entry = json.loads(purldb_entry_file.read_text()) - mock_collect_data.return_value = [purldb_entry] - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - self.assertTrue(package1.extra_data.get("enrich_with_purldb")) - - run.refresh_from_db() - self.assertIn("pkg:npm/csvtojson@2.0.10 ['release_date'", run.log) - self.assertIn("1 discovered package enriched with the PurlDB.", run.log) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. 
+# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +import io +import json +import os +import sys +import tempfile +from contextlib import redirect_stderr +from pathlib import Path +from unittest import mock +from unittest import skipIf + +from django.conf import settings +from django.test import TestCase +from django.test import tag + +from packageurl import PackageURL +from scancode.cli_test_utils import purl_with_fake_uuid +from scorecode.models import PackageScore + +from scanpipe import pipes +from scanpipe.models import CodebaseResource +from scanpipe.models import DiscoveredPackage +from scanpipe.models import InputSource +from scanpipe.pipelines import CommonStepsMixin +from scanpipe.pipelines import InputFilesError +from scanpipe.pipelines import Pipeline +from scanpipe.pipelines import analyze_root_filesystem +from scanpipe.pipelines import deploy_to_develop +from scanpipe.pipelines import is_pipeline +from scanpipe.pipelines import scan_single_package +from scanpipe.pipes import d2d +from scanpipe.pipes import flag +from scanpipe.pipes import output +from scanpipe.pipes import scancode +from scanpipe.pipes.input import copy_input +from scanpipe.tests import FIXTURES_REGEN +from scanpipe.tests import make_mock_response +from scanpipe.tests import make_package +from scanpipe.tests import make_project +from scanpipe.tests import package_data1 +from scanpipe.tests.pipelines.do_nothing import DoNothing +from scanpipe.tests.pipelines.download_inputs import DownloadInput +from scanpipe.tests.pipelines.profile_step import ProfileStep +from scanpipe.tests.pipelines.steps_as_attribute import StepsAsAttribute +from scanpipe.tests.pipelines.with_groups import WithGroups + +from_docker_image = os.environ.get("FROM_DOCKER_IMAGE") + + +class ScanPipePipelinesTest(TestCase): + data = Path(__file__).parent / "data" + + def test_scanpipe_pipeline_class_pipeline_name_attribute(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline_instance = DoNothing(run) + self.assertEqual("do_nothing", pipeline_instance.pipeline_name) + + def test_scanpipe_pipeline_class_get_info(self): + expected = { + "description": "Description section of the doc string.", + "summary": "Do nothing, in 2 steps.", + "steps": [ + {"name": "step1", "doc": "Step1 doc.", "groups": []}, + {"name": "step2", "doc": "Step2 doc.", "groups": []}, + ], + "available_groups": [], + } + self.assertEqual(expected, DoNothing.get_info()) + + expected = { + "summary": "Profile a step using the @profile decorator.", + "description": "", + "steps": [ 
+ {"name": "step", "doc": "", "groups": []}, + ], + "available_groups": [], + } + self.assertEqual(expected, ProfileStep.get_info()) + + def test_scanpipe_pipeline_class_get_summary(self): + expected = "Do nothing, in 2 steps." + self.assertEqual(expected, DoNothing.get_summary()) + + expected = "Profile a step using the @profile decorator." + self.assertEqual(expected, ProfileStep.get_summary()) + + def test_scanpipe_pipeline_class_log(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + pipeline.log("Event1") + pipeline.log("Event2") + + run.refresh_from_db() + self.assertIn("Event1", run.log) + self.assertIn("Event2", run.log) + + def test_scanpipe_pipeline_class_execute(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode) + self.assertEqual("", out) + + run.refresh_from_db() + self.assertIn("Pipeline [do_nothing] starting", run.log) + self.assertIn("Step [step1] starting", run.log) + self.assertIn("Step [step1] completed", run.log) + self.assertIn("Step [step2] starting", run.log) + self.assertIn("Step [step2] completed", run.log) + self.assertIn("Pipeline completed", run.log) + + def test_scanpipe_pipeline_class_execute_with_exception(self): + project1 = make_project() + run = project1.add_pipeline("raise_exception") + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(1, exitcode) + self.assertTrue(out.startswith("Error message")) + self.assertIn("Traceback:", out) + self.assertIn("in execute", out) + self.assertIn("step(self)", out) + self.assertIn("in raise_exception", out) + self.assertIn("raise ValueError", out) + + run.refresh_from_db() + self.assertIn("Pipeline [raise_exception] starting", run.log) + self.assertIn("Step [raise_exception_step] starting", run.log) + self.assertIn("Pipeline failed", run.log) + + @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step1") + @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step2") + def test_scanpipe_pipeline_class_execute_with_selected_steps(self, step2, step1): + step1.__name__ = "step1" + step1.groups = [] + step2.__name__ = "step2" + step2.groups = [] + + project1 = make_project() + run = project1.add_pipeline("do_nothing") + run.update(selected_steps=["step2", "not_existing_step"]) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode) + self.assertEqual("", out) + + step1.assert_not_called() + step2.assert_called() + + run.refresh_from_db() + self.assertIn("Pipeline [do_nothing] starting", run.log) + self.assertIn("Step [step1] skipped", run.log) + self.assertIn("Step [step2] starting", run.log) + self.assertIn("Step [step2] completed", run.log) + self.assertIn("Pipeline completed", run.log) + + def test_scanpipe_pipeline_class_download_inputs_attribute(self): + project1 = make_project() + run = project1.add_pipeline("download_inputs") + pipeline = run.make_pipeline_instance() + self.assertTrue(pipeline.download_inputs) + expected = (CommonStepsMixin.download_missing_inputs,) + self.assertEqual(expected, pipeline.get_initial_steps()) + expected = (CommonStepsMixin.download_missing_inputs, DownloadInput.step1) + self.assertEqual(expected, pipeline.get_steps()) + pipeline.execute() + self.assertIn("Step [download_missing_inputs]", run.log) + + run = project1.add_pipeline("profile_step") + pipeline = 
run.make_pipeline_instance() + self.assertFalse(pipeline.download_inputs) + pipeline.execute() + self.assertNotIn("Step [download_missing_inputs]", run.log) + + @mock.patch("requests.sessions.Session.get") + def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + file_location = self.data / "aboutcode" / "notice.NOTICE" + input_source = project1.add_input_source( + filename=file_location.name, is_uploaded=True + ) + self.assertFalse(input_source.exists()) + with self.assertRaises(InputFilesError) as error: + pipeline.download_missing_inputs() + error_msg = ( + "InputFilesError encountered with the following issues:\n\n" + "Error 1: Uploaded file filename=notice.NOTICE [uploaded] not available." + "\n\nNo traceback available." + ) + self.assertEqual(error_msg, str(error.exception)) + self.assertIn( + "Uploaded file filename=notice.NOTICE [uploaded] not available.", run.log + ) + + project1.copy_input_from(file_location) + self.assertTrue(input_source.exists()) + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + pipeline.download_missing_inputs() + self.assertEqual("", run.log) + + download_url = "https://download.url/file.zip" + mock_get.return_value = make_mock_response(url=download_url) + input_source2 = project1.add_input_source(download_url=download_url) + pipeline.download_missing_inputs() + self.assertIn("Fetching input from https://download.url/file.zip", run.log) + input_source2.refresh_from_db() + self.assertEqual("file.zip", input_source2.filename) + self.assertTrue(input_source2.exists()) + mock_get.assert_called_once() + + @mock.patch("scanpipe.models.InputSource.fetch") + def test_scanpipe_pipeline_class_download_fetch_exception(self, mock_fetch): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + mock_fetch.side_effect = Exception("File not found") + download_url = "https://download.url/file.zip" + project1.add_input_source(download_url=download_url) + + with self.assertRaises(InputFilesError) as error: + pipeline.download_missing_inputs() + self.assertIn( + "InputFilesError encountered with the following issues:", + str(error.exception), + ) + self.assertIn("Error 1: File not found", str(error.exception)) + self.assertIn("Traceback (most recent call last):", str(error.exception)) + self.assertIn("Exception: File not found", str(error.exception)) + + self.assertIn("Fetching input from https://download.url/file.zip", run.log) + self.assertIn("https://download.url/file.zip could not be fetched.", run.log) + + @mock.patch("git.repo.base.Repo.clone_from") + def test_scanpipe_pipeline_class_download_missing_inputs_git_repo(self, mock_clone): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + download_url = "https://github.com/aboutcode-org/scancode.io.git" + input_source = project1.add_input_source(download_url=download_url) + + def mock_make_to_path(**kwargs): + to_path = kwargs.get("to_path") + to_path.mkdir() + + mock_clone.side_effect = mock_make_to_path + mock_clone.return_value = None + + pipeline.download_missing_inputs() + self.assertIn( + "Fetching input from https://github.com/aboutcode-org/scancode.io.git", + run.log, + ) + input_source.refresh_from_db() + self.assertEqual("scancode.io.git", input_source.filename) + 
self.assertTrue(input_source.exists())
+
+    @mock.patch("requests.get")
+    def test_archive_downloads(self, mock_get):
+        project1 = make_project()
+        run = project1.add_pipeline("scan_codebase")
+        pipeline = run.make_pipeline_instance()
+        test_filename = "sample.tar.gz"
+        test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        test_data_path = (
+            Path(__file__).parent / "data" / "test-downloads" / test_filename
+        )
+        with open(test_data_path, "rb") as f:
+            test_content = f.read()
+
+        input_source = InputSource.objects.create(
+            project=project1,
+            filename=test_filename,
+            download_url=test_url,
+            is_uploaded=False,
+        )
+
+        mock_get.return_value.content = test_content
+        mock_get.return_value.status_code = 200
+
+        pipeline.download_missing_inputs()
+        input_source.refresh_from_db()
+        self.assertTrue(
+            input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
+
+        pipeline.archive_downloads()
+        input_source.refresh_from_db()
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertEqual(input_source.filename, test_filename)
+
+        # Downloading the same URL from a second project must reuse the
+        # already-archived file rather than fetch and store it again.
+        project2 = make_project(name="project2")
+        input_source2 = InputSource.objects.create(
+            project=project2,
+            filename=test_filename,
+            download_url=test_url,
+            is_uploaded=False,
+        )
+        run2 = project2.add_pipeline("scan_codebase")
+        pipeline2 = run2.make_pipeline_instance()
+        pipeline2.download_missing_inputs()
+        input_source2.refresh_from_db()
+        self.assertEqual(input_source.file_path, input_source2.file_path)
+        self.assertTrue(Path(input_source2.file_path).exists())
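
The two-project assertions above capture the intended deduplication behavior: a URL fetched once is archived at a project-independent path and reused. A minimal sketch of a content-addressed layout consistent with those assertions, assuming a sha256-based fan-out; the archive_path_for helper and the exact path shape are illustrative only and not taken from this patch:

    import hashlib
    from pathlib import Path

    def archive_path_for(content: bytes, filename: str, archive_root: Path) -> Path:
        # Hash the downloaded bytes so identical content maps to a single
        # location, no matter which project triggered the download.
        sha256 = hashlib.sha256(content).hexdigest()
        # Fan out on the leading hex pairs to keep directories small.
        return archive_root / sha256[:2] / sha256[2:4] / sha256 / filename
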
+
+    def test_scanpipe_pipeline_class_save_errors_context_manager(self):
+        project1 = make_project()
+        run = project1.add_pipeline("do_nothing")
+        pipeline = run.make_pipeline_instance()
+        self.assertEqual(project1, pipeline.project)
+
+        with pipeline.save_errors(Exception):
+            raise Exception("Error message")
+
+        message = project1.projectmessages.get()
+        self.assertEqual("do_nothing", message.model)
+        self.assertEqual({}, message.details)
+        self.assertEqual("Error message", message.description)
+        self.assertIn('raise Exception("Error message")', message.traceback)
+
+        resource1 = CodebaseResource.objects.create(project=project1, path="filename")
+        with pipeline.save_errors(Exception, resource=resource1):
+            raise Exception("Error message")
+        message = project1.projectmessages.latest("created_date")
+        self.assertEqual({"resource_path": str(resource1.path)}, message.details)
+
+    def test_scanpipe_pipelines_is_pipeline(self):
+        self.assertFalse(is_pipeline(None))
+        self.assertFalse(is_pipeline(Pipeline))
+        self.assertTrue(is_pipeline(DoNothing))
+
+        class SubSubClass(DoNothing):
+            pass
+
+        self.assertTrue(is_pipeline(SubSubClass))
+
+    def test_scanpipe_pipeline_class_get_graph(self):
+        expected = [
+            {"name": "step1", "doc": "Step1 doc.", "groups": []},
+            {"name": "step2", "doc": "Step2 doc.", "groups": []},
+        ]
+        self.assertEqual(expected, DoNothing.get_graph())
+
+    def test_scanpipe_pipelines_profile_decorator(self):
+        project1 = make_project()
+        run = project1.add_pipeline("profile_step")
+        pipeline_instance = run.make_pipeline_instance()
+
+        exitcode, out = pipeline_instance.execute()
+        self.assertEqual(0, exitcode)
+
+        run.refresh_from_db()
+        self.assertIn("Profiling results at", run.log)
+        self.assertIn("Pipeline completed", run.log)
+
+        self.assertEqual(1, len(project1.output_root))
+        output_file = project1.output_root[0]
+        self.assertTrue(output_file.startswith("profile-"))
+        self.assertTrue(output_file.endswith(".html"))
+
+    def test_scanpipe_pipeline_class_get_steps(self):
+        expected = (
+            DoNothing.step1,
+            DoNothing.step2,
+        )
+        self.assertEqual(expected, DoNothing.get_steps())
+
+        with self.assertRaises(TypeError) as cm:
+            StepsAsAttribute.get_steps()
+        expected = "Use a ``steps(cls)`` classmethod to declare the steps."
+        self.assertEqual(expected, str(cm.exception))
+
+    def test_scanpipe_pipeline_class_get_steps_with_groups(self):
+        expected = (WithGroups.no_groups,)
+        self.assertEqual(expected, WithGroups.get_steps())
+        self.assertEqual(expected, WithGroups.get_steps(groups=[]))
+        self.assertEqual(expected, WithGroups.get_steps(groups=["not_defined"]))
+
+        expected = (
+            WithGroups.grouped_with_foo_and_bar,
+            WithGroups.grouped_with_bar,
+            WithGroups.no_groups,
+        )
+        self.assertEqual(expected, WithGroups.get_steps(groups=["bar"]))
+        self.assertEqual(expected, WithGroups.get_steps(groups=["foo", "bar"]))
+
+        expected = (
+            WithGroups.grouped_with_foo_and_bar,
+            WithGroups.no_groups,
+        )
+        self.assertEqual(expected, WithGroups.get_steps(groups=["foo"]))
+
+    def test_scanpipe_pipeline_class_get_available_groups(self):
+        self.assertEqual(["bar", "excluded", "foo"], WithGroups.get_available_groups())
+        self.assertEqual([], DoNothing.get_available_groups())
+
+    def test_scanpipe_pipeline_class_env_loaded_from_config_file(self):
+        project1 = make_project()
+        run = project1.add_pipeline("do_nothing")
+        pipeline = run.make_pipeline_instance()
+        self.assertEqual({}, pipeline.env)
+
+        config_file = project1.input_path / settings.SCANCODEIO_CONFIG_FILE
+        config_file.write_text("{*this is not valid yml*}")
+        pipeline = run.make_pipeline_instance()
+        self.assertEqual({}, pipeline.env)
+
+        config_file.write_text("product_name: Product")
+        pipeline = run.make_pipeline_instance()
+        self.assertEqual({"product_name": "Product"}, pipeline.env)
+
+    def test_scanpipe_pipeline_class_env_reloaded_after_extraction(self):
+        project1 = make_project()
+
+        input_location = self.data / "settings" / "archived-scancode-config.zip"
+        project1.copy_input_from(input_location)
+        run = project1.add_pipeline("scan_codebase")
+        pipeline = run.make_pipeline_instance()
+        self.assertEqual({}, pipeline.env)
+
+        # Manually run the steps; the env is reloaded from the scancode-config.yml
+        # contained in the archive.
+        pipeline.copy_inputs_to_codebase_directory()
+        pipeline.extract_archives()
+
+        expected = {
+            "product_name": "My Product Name",
+            "product_version": "1.0",
+            "ignored_patterns": ["*.tmp", "tests/*"],
+        }
+        self.assertEqual(expected, pipeline.env)
+
+    def test_scanpipe_pipeline_class_flag_ignored_resources(self):
+        project1 = make_project()
+        run = project1.add_pipeline("do_nothing")
+        pipeline = run.make_pipeline_instance()
+        self.assertIsNone(pipeline.env.get("ignored_patterns"))
+
+        project1.settings.update({"ignored_patterns": "*.ext"})
+        project1.save()
+        pipeline = run.make_pipeline_instance()
+
+        with mock.patch("scanpipe.pipes.flag.flag_ignored_patterns") as mock_flag:
+            mock_flag.return_value = None
+            pipeline.flag_ignored_resources()
+
+        mock_flag.assert_called_once()
+        patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS]
+        self.assertEqual(mock_flag.mock_calls[0].kwargs["patterns"], patterns_args)
+        self.assertEqual(mock_flag.mock_calls[0].kwargs["codebaseresources"].count(), 0)
+
+    def test_scanpipe_pipeline_class_extract_archive(self):
+        project1 = make_project()
+        run = 
project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + target = tempfile.mkdtemp() + input_location = str(self.data / "scancode" / "corrupted.tar.gz") + pipeline.extract_archive(input_location, target) + + projects_errors = project1.projectmessages.all() + self.assertEqual(1, len(projects_errors)) + project_error = projects_errors.get() + self.assertEqual("error", project_error.severity) + self.assertIn("gzip decompression failed", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "corrupted.tar.gz"}, project_error.details) + self.assertEqual("", project_error.traceback) + + def test_scanpipe_pipeline_class_extract_archives(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + input_location = str(self.data / "scancode" / "corrupted.tar.gz") + resource_location = copy_input(input_location, project1.codebase_path) + pipeline.extract_archives() + + projects_errors = project1.projectmessages.all() + self.assertEqual(1, len(projects_errors)) + project_error = projects_errors.get() + self.assertEqual("error", project_error.severity) + self.assertIn("gzip decompression failed", project_error.description) + self.assertEqual("extract_archives", project_error.model) + self.assertEqual( + {"resource_path": str(resource_location)}, project_error.details + ) + self.assertEqual("", project_error.traceback) + + +class RootFSPipelineTest(TestCase): + def test_scanpipe_rootfs_pipeline_extract_input_files_errors(self): + project1 = make_project() + run = project1.add_pipeline("analyze_root_filesystem_or_vm_image") + pipeline_instance = analyze_root_filesystem.RootFS(run) + + # Create 2 files in the input/ directory to generate error twice + project1.move_input_from(tempfile.mkstemp()[1]) + project1.move_input_from(tempfile.mkstemp()[1]) + self.assertEqual(2, len(project1.input_files)) + + with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: + extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} + pipeline_instance.extract_input_files_to_codebase_directory() + + projects_errors = project1.projectmessages.all() + self.assertEqual(2, len(projects_errors)) + project_error = projects_errors[0] + self.assertEqual("error", project_error.severity) + self.assertEqual("error1\nerror2", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "resource"}, project_error.details) + self.assertEqual("", project_error.traceback) + + +def sort_for_os_compatibility(scan_data): + """Sort the ``scan_data`` files and relations in place. 
Return ``scan_data``."""
+ if files := scan_data.get("files"):
+ files.sort(key=lambda x: x["path"])
+
+ if relations := scan_data.get("relations"):
+ relations.sort(key=lambda x: x["to_resource"])
+
+ return scan_data
+
+
+@tag("slow")
+class PipelinesIntegrationTest(TestCase):
+ """Integration tests to ensure the proper output for each built-in pipeline."""
+
+ # Un-comment the following to display full diffs:
+ # maxDiff = None
+ data = Path(__file__).parent / "data"
+ exclude_from_diff = [
+ "start_timestamp",
+ "end_timestamp",
+ "date",
+ "duration",
+ "input",
+ "compliance_alert",
+ "policy",
+ "tool_version",
+ "other_tools",
+ "created_date",
+ "log",
+ "uuid",
+ "size", # directory sizes are OS dependent
+ "size_count",
+ "--json-pp",
+ "--processes",
+ "--verbose",
+ # system_environment differs between systems
+ "system_environment",
+ "file_type",
+ # mime type and is_script are inconsistent across systems
+ "mime_type",
+ "is_script",
+ "notes",
+ "settings",
+ "description",
+ "traceback",
+ ]
+
+ def _without_keys(self, data, exclude_keys):
+ """Return the `data` excluding the provided `exclude_keys`."""
+ if isinstance(data, list):
+ return [self._without_keys(entry, exclude_keys) for entry in data]
+
+ if isinstance(data, dict):
+ return {
+ key: (
+ self._without_keys(value, exclude_keys)
+ if type(value) in [list, dict]
+ else value
+ )
+ for key, value in data.items()
+ if key not in exclude_keys
+ }
+
+ return data
+
+ def purl_fields_with_fake_uuid(self, value, key):
+ purl_fields = ["purl", "for_packages", "package_uid"]
+ purl_name = "fixed-name-for-testing-5642512d1758"
+ purl_namespace = "fixed-namespace-for-testing-5642512d1758"
+
+ if key == "name":
+ return purl_name
+ elif key == "namespace":
+ return purl_namespace
+ elif key in purl_fields:
+ purl_old = PackageURL.from_string(value)
+ if purl_old.type != "local-files":
+ return purl_with_fake_uuid(value)
+
+ purl = PackageURL(
+ name=purl_name,
+ namespace=purl_namespace,
+ type="local-files",
+ version=purl_old.version,
+ qualifiers=purl_old.qualifiers,
+ subpath=purl_old.subpath,
+ )
+ return purl_with_fake_uuid(purl.to_string())
+
+ def _normalize_package_uids(self, data):
+ """
+ Return the `data`, where any `package_uid` value has been normalized
+ with `purl_with_fake_uuid()`.
+ """
+ fields_with_package_uids = [
+ "package_uid",
+ "dependency_uid",
+ "for_package_uid",
+ "resolved_to_package_uid",
+ ]
+ if isinstance(data, list):
+ return [self._normalize_package_uids(entry) for entry in data]
+
+ if isinstance(data, dict):
+ is_local_files = False
+ if data.get("type") and data["type"] == "local-files":
+ is_local_files = True
+ normalized_data = {}
+ for key, value in data.items():
+ if isinstance(value, list | dict):
+ value = self._normalize_package_uids(value)
+ if key in fields_with_package_uids and value:
+ value = purl_with_fake_uuid(value)
+ if key == "for_packages" and value:
+ value = sorted(
+ [
+ self.purl_fields_with_fake_uuid(package_uid, key)
+ for package_uid in value
+ ]
+ )
+ if (
+ is_local_files
+ and key in ("name", "namespace", "purl", "package_uid")
+ and value
+ ):
+ value = self.purl_fields_with_fake_uuid(value, key)
+ normalized_data[key] = value
+ return normalized_data
+
+ return data
+
+ def _sort_dependencies(self, data):
+ """
+ Sort dependencies by their "for_package_uid".
+
+ After dependency resolution, in some cases we have multiple
+ dependency requirements resolved to the same package, and they
+ are not sorted the same way every time. 
+ """
+ mappings = data.get("dependencies")
+ if mappings:
+ mappings_by_uid = {}
+ for mapping in mappings:
+ uid = mapping.get("for_package_uid") or ""
+ mappings_by_uid[uid] = mapping
+ data["dependencies"] = list(dict(sorted(mappings_by_uid.items())).values())
+ return data
+
+ def test_package_uids_normalized_in_pipeline_integration_tests(self):
+ self.maxDiff = 1000
+ data = {
+ "type": "local-files",
+ "package_uid": (
+ "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23"
+ "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24"
+ ),
+ "for_packages": [
+ (
+ "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23"
+ "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24"
+ )
+ ],
+ }
+ normalized_data = self._normalize_package_uids(data=data)
+ expected_data = {
+ "type": "local-files",
+ "package_uid": (
+ "pkg:local-files/fixed-namespace-for-testing-5642512d1758/"
+ "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758"
+ ),
+ "for_packages": [
+ (
+ "pkg:local-files/fixed-namespace-for-testing-5642512d1758/"
+ "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758"
+ )
+ ],
+ }
+ self.assertEqual(normalized_data, expected_data)
+
+ def assertPipelineResultEqual(
+ self, expected_file, result_file, sort_dependencies=False, regen=FIXTURES_REGEN
+ ):
+ """Set `regen` to True to regenerate the expected results."""
+ result_json = json.loads(Path(result_file).read_text())
+ result_json = self._normalize_package_uids(result_json)
+ result_data = self._without_keys(result_json, self.exclude_from_diff)
+ if sort_dependencies:
+ result_data = self._sort_dependencies(result_data)
+ result_data = sort_for_os_compatibility(result_data)
+
+ if regen:
+ expected_file.write_text(json.dumps(result_data, indent=2))
+
+ expected_json = json.loads(expected_file.read_text())
+ expected_json = self._normalize_package_uids(expected_json)
+ expected_data = self._without_keys(expected_json, self.exclude_from_diff)
+ if sort_dependencies:
+ expected_data = self._sort_dependencies(expected_data)
+ expected_data = sort_for_os_compatibility(expected_data)
+
+ self.assertEqual(expected_data, result_data)
+
+ @skipIf(from_docker_image, "Random failure in the Docker context.")
+ def test_scanpipe_scan_package_pipeline_integration(self):
+ pipeline_name = "scan_single_package"
+ project1 = make_project()
+
+ input_location = self.data / "scancode" / "is-npm-1.0.0.tgz"
+ project1.copy_input_from(input_location)
+
+ run = project1.add_pipeline(pipeline_name)
+ pipeline = run.make_pipeline_instance()
+
+ exitcode, out = pipeline.execute()
+ self.assertEqual(0, exitcode, msg=out)
+
+ self.assertEqual(4, project1.codebaseresources.count())
+ self.assertEqual(1, project1.discoveredpackages.count())
+ self.assertEqual(1, project1.discovereddependencies.count())
+
+ scancode_file = project1.get_latest_output(filename="scancode")
+ expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_package.json"
+ self.assertPipelineResultEqual(expected_file, scancode_file)
+
+ summary_file = project1.get_latest_output(filename="summary")
+ expected_file = (
+ self.data / "scancode" / "is-npm-1.0.0_scan_package_summary.json"
+ )
+ self.assertPipelineResultEqual(expected_file, summary_file)
+
+ # Ensure that we only have one instance of is-npm in `key_files_packages`
+ summary_data = json.loads(Path(summary_file).read_text())
+ key_files_packages = summary_data.get("key_files_packages", [])
+ self.assertEqual(1, len(key_files_packages))
+ key_file_package = 
key_files_packages[0] + key_file_package_purl = key_file_package.get("purl", "") + self.assertEqual("pkg:npm/is-npm@1.0.0", key_file_package_purl) + + @skipIf(from_docker_image, "Random failure in the Docker context.") + def test_scanpipe_scan_package_pipeline_integration_multiple_packages(self): + pipeline_name = "scan_single_package" + project1 = make_project() + + input_location = self.data / "scancode" / "multiple-is-npm-1.0.0.tar.gz" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(9, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(2, project1.discovereddependencies.count()) + + scancode_file = project1.get_latest_output(filename="scancode") + expected_file = ( + self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package.json" + ) + # Do not override the regen as this file is generated in regen_test_data + self.assertPipelineResultEqual(expected_file, scancode_file) + + summary_file = project1.get_latest_output(filename="summary") + expected_file = ( + self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package_summary.json" + ) + self.assertPipelineResultEqual(expected_file, summary_file) + + @mock.patch("scanpipe.pipelines.scan_single_package.is_archive") + def test_scanpipe_scan_package_single_extract_input_to_codebase_directory( + self, mock_is_archive + ): + project1 = make_project() + run = project1.add_pipeline("scan_single_package") + pipeline_instance = scan_single_package.ScanSinglePackage(run) + + project1.move_input_from(tempfile.mkstemp(suffix=".zip")[1]) + self.assertEqual(1, len(project1.input_files)) + + mock_is_archive.return_value = True + pipeline_instance.get_package_input() + with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: + extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} + pipeline_instance.extract_input_to_codebase_directory() + + projects_errors = project1.projectmessages.all() + self.assertEqual(1, len(projects_errors)) + project_error = projects_errors[0] + self.assertEqual("error", project_error.severity) + self.assertEqual("error1\nerror2", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "resource"}, project_error.details) + self.assertEqual("", project_error.traceback) + + def test_scanpipe_scan_package_single_file(self): + pipeline_name = "scan_single_package" + project1 = make_project() + + input_location = self.data / "manifests" / "openpdf-parent-1.3.11.pom.xml" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.codebaseresources.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(10, project1.discovereddependencies.count()) + + scancode_file = project1.get_latest_output(filename="scancode") + expected_file = ( + self.data / "manifests" / "openpdf-parent-1.3.11_scan_package.json" + ) + self.assertPipelineResultEqual(expected_file, scancode_file) + + @mock.patch("git.repo.base.Repo.clone_from") + def test_scanpipe_scan_package_single_package_git_repo(self, mock_clone): + pipeline_name = "scan_single_package" + project1 = make_project() + + download_url = 
"https://github.com/aboutcode-org/scancode.io.git" + project1.add_input_source(download_url=download_url) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + # Create the "fetched" git directory content + def mock_make_git_directory(**kwargs): + to_path = kwargs.get("to_path") # scancode.io.git + to_path.mkdir() + file_location = self.data / "aboutcode" / "notice.NOTICE" + copy_input(file_location, to_path) + + mock_clone.side_effect = mock_make_git_directory + mock_clone.return_value = None + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(2, project1.codebaseresources.count()) + self.assertEqual(0, project1.discoveredpackages.count()) + + def test_scanpipe_scan_codebase_pipeline_integration(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(6, project1.codebaseresources.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(1, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_scan_codebase_creates_top_level_paths(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] + + top_level_resources = project1.codebaseresources.filter(parent_path="") + top_level_paths = [resource.path for resource in top_level_resources] + + self.assertListEqual(top_level_paths, expected_top_level_paths) + + def test_scanpipe_scan_codebase_creates_parent_path_field(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] + expected_nested_paths = [ + "is-npm-1.0.0.tgz-extract/package/index.js", + "is-npm-1.0.0.tgz-extract/package/package.json", + "is-npm-1.0.0.tgz-extract/package/readme.md", + ] + + top_level_resources = project1.codebaseresources.filter(parent_path="") + top_level_paths = [resource.path for resource in top_level_resources] + + self.assertListEqual(top_level_paths, expected_top_level_paths) + + nested_resources = project1.codebaseresources.filter( + parent_path="is-npm-1.0.0.tgz-extract/package" + ) + nested_paths = [resource.path for resource in nested_resources] + + self.assertListEqual(nested_paths, expected_nested_paths) + + def test_scanpipe_inspect_packages_creates_packages_npm(self): + pipeline_name = 
"inspect_packages" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(6, project1.codebaseresources.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(1, project1.discovereddependencies.count()) + + package = project1.discoveredpackages.get() + dependency = project1.discovereddependencies.get() + + self.assertEqual(3, package.codebase_resources.count()) + self.assertEqual("pkg:npm/is-npm@1.0.0", dependency.for_package.purl) + self.assertEqual(package.datasource_ids, [dependency.datasource_id]) + self.assertEqual( + package.codebase_resources.get( + path="is-npm-1.0.0.tgz-extract/package/package.json" + ).path, + dependency.datafile_resource.path, + ) + + def test_scanpipe_inspect_packages_creates_packages_pypi(self): + pipeline_name = "inspect_packages" + project1 = make_project() + + input_location = self.data / "manifests" / "python-inspector-0.10.0.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(6, project1.codebaseresources.count()) + self.assertEqual(0, project1.discoveredpackages.count()) + self.assertEqual(26, project1.discovereddependencies.count()) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_inspect_packages_with_resolved_dependencies_npm(self): + pipeline_name = "inspect_packages" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_npm.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(4, project1.codebaseresources.count()) + self.assertEqual(7, project1.discoveredpackages.count()) + self.assertEqual(6, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data + / "dependencies" + / "resolved_dependencies_npm_inspect_packages.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_inspect_packages_with_resolved_dependencies_poetry(self): + pipeline_name = "inspect_packages" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_poetry.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(5, project1.codebaseresources.count()) + self.assertEqual(6, project1.discoveredpackages.count()) + self.assertEqual(10, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data + / "dependencies" + / "resolved_dependencies_poetry_inspect_packages.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform == "darwin", "Not 
supported on macOS") + def test_scanpipe_resolved_dependencies_cocoapods(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + + input_location = ( + self.data / "dependencies" / "resolved_dependencies_cocoapods.zip" + ) + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(3, project1.codebaseresources.count()) + self.assertEqual(25, project1.discoveredpackages.count()) + self.assertEqual(30, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "dependencies" / "resolved_dependencies_cocoapods.json" + ) + self.assertPipelineResultEqual( + expected_file, result_file, sort_dependencies=True + ) + + def test_scanpipe_resolved_dependencies_pip_inspect(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_pip.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(3, project1.codebaseresources.count()) + self.assertEqual(4, project1.discoveredpackages.count()) + self.assertEqual(17, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "dependencies" / "resolved_dependencies_pip.json" + self.assertPipelineResultEqual( + expected_file, + result_file, + ) + + def test_scanpipe_resolved_dependencies_nuget(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_nuget.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(3, project1.codebaseresources.count()) + self.assertEqual(34, project1.discoveredpackages.count()) + self.assertEqual(108, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "dependencies" / "resolved_dependencies_nuget.json" + self.assertPipelineResultEqual( + expected_file, + result_file, + sort_dependencies=True, + ) + + def test_scanpipe_scan_codebase_can_process_wheel(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "daglib-0.6.0-py3-none-any.whl" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(11, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(8, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "scancode" / "daglib-0.6.0-py3-none-any.whl_scan_codebase.json" + ) + self.assertPipelineResultEqual(expected_file, 
result_file) + + @skipIf(sys.platform != "linux", "Expected results are inconsistent across OS") + def test_scanpipe_docker_pipeline_alpine_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "alpine_3_15_4.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(510, project1.codebaseresources.count()) + self.assertEqual(14, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "docker" / "alpine_3_15_4_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_docker_pipeline_does_not_report_errors_for_broken_symlinks(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "minitag.tar" + input_location = self.data / "image-with-symlinks" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + with redirect_stderr(io.StringIO()): + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + project_messages = project1.projectmessages.all() + self.assertEqual(1, len(project_messages)) + self.assertEqual("Distro not found.", project_messages[0].description) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "image-with-symlinks" / (filename + "-expected-scan.json") + ) + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform != "linux", "RPM related features only supported on Linux.") + def test_scanpipe_docker_pipeline_rpm_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "centos.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(29, project1.codebaseresources.count()) + self.assertEqual(101, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "docker" / "centos_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_docker_pipeline_debian_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "debian.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(16, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = 
self.data / "docker" / "debian_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_docker_pipeline_distroless_debian_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "gcr_io_distroless_base.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(2458, project1.codebaseresources.count()) + self.assertEqual(6, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "docker" / "gcr_io_distroless_base_scan_codebase.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_rootfs_pipeline_integration(self): + pipeline_name = "analyze_root_filesystem_or_vm_image" + project1 = make_project() + + input_location = self.data / "rootfs" / "basic-rootfs.tar.gz" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(17, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "rootfs" / "basic-rootfs_root_filesystems.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_load_inventory_pipeline_integration(self): + pipeline_name = "load_inventory" + project1 = make_project() + + input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(18, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(4, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "asgiref" / "asgiref-3.3.0_load_inventory_expected.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + # Using the ScanCode.io JSON output as the input + project2 = make_project() + + input_location = self.data / "asgiref" / "asgiref-3.3.0_scanpipe_output.json" + project2.copy_input_from(input_location) + + run = project2.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(18, project2.codebaseresources.count()) + self.assertEqual(2, project2.discoveredpackages.count()) + self.assertEqual(4, project2.discovereddependencies.count()) + + @mock.patch("scanpipe.pipes.vulnerablecode.is_available") + @mock.patch("scanpipe.pipes.vulnerablecode.is_configured") + @mock.patch("scanpipe.pipes.vulnerablecode.bulk_search_by_purl") + def test_scanpipe_find_vulnerabilities_pipeline_integration( + self, mock_bulk_search_by_purl, mock_is_configured, mock_is_available + ): + pipeline_name = "find_vulnerabilities" + project1 = make_project() + 
package1 = DiscoveredPackage.create_from_data(project1, package_data1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_configured.return_value = False + mock_is_available.return_value = False + exitcode, out = pipeline.execute() + self.assertEqual(1, exitcode, msg=out) + self.assertIn("VulnerableCode is not configured.", out) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_configured.return_value = True + mock_is_available.return_value = True + vulnerability_data = [ + { + "purl": "pkg:deb/debian/adduser@3.118?arch=all", + "affected_by_vulnerabilities": [ + { + "vulnerability_id": "VCID-cah8-awtr-aaad", + "summary": "An issue was discovered.", + }, + ], + }, + { + "purl": "pkg:deb/debian/adduser@3.118?qualifiers=1", + "affected_by_vulnerabilities": [ + { + "vulnerability_id": "VCID-cah8-awtr-aaad", + "summary": "An issue was discovered.", + }, + ], + }, + ] + mock_bulk_search_by_purl.return_value = vulnerability_data + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + package1.refresh_from_db() + expected = vulnerability_data[0]["affected_by_vulnerabilities"] + self.assertEqual(expected, package1.affected_by_vulnerabilities) + + @mock.patch("scorecode.ossf_scorecard.is_available") + def test_scanpipe_fetch_scores_pipeline_integration(self, mock_is_available): + pipeline_name = "fetch_scores" + project1 = make_project() + package1 = DiscoveredPackage.create_from_data(project1, package_data1) + package1.vcs_url = "https://github.com/ossf/scorecard" + package1.save() + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_available.return_value = False + exitcode, out = pipeline.execute() + self.assertEqual(1, exitcode, msg=out) + self.assertIn("ScoreCode service is not available.", out) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_available.return_value = True + + package_score_data = { + "scoring_tool": "ossf_scorecard", + "scoring_tool_version": "v5.2.1", + "score": "9.7", + "scoring_tool_documentation_url": "https://github.com/[trunc...]", + "score_date": "2025-07-24T18:50:16Z", + } + with mock.patch("scorecode.ossf_scorecard.fetch_scorecard") as fetch: + fetch.return_value = PackageScore(**package_score_data) + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + package1.refresh_from_db() + scorecard_entry = package1.scores.filter(scoring_tool="ossf-scorecard").first() + self.assertIsNotNone(scorecard_entry) + self.assertEqual("ossf-scorecard", scorecard_entry.scoring_tool) + self.assertEqual("v5.2.1", scorecard_entry.scoring_tool_version) + self.assertTrue(scorecard_entry.score) + + def test_scanpipe_resolve_dependencies_pipeline_integration(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + project1.move_input_from(tempfile.mkstemp()[1]) + pipeline.execute() + self.assertEqual(1, project1.projectmessages.count()) + message = project1.projectmessages.get() + self.assertEqual("get_packages_from_manifest", message.model) + expected = "No resources containing package data found in codebase." 
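+ # The empty temporary input file contains no manifest, hence the message asserted below.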
+ self.assertIn(expected, message.description) + + def test_scanpipe_resolve_dependencies_pipeline_integration_empty_manifest(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) + pipeline.execute() + self.assertEqual(1, project1.projectmessages.count()) + message = project1.projectmessages.get() + self.assertEqual("get_packages_from_manifest", message.model) + expected = "No packages could be resolved" + self.assertIn(expected, message.description) + + @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") + def test_scanpipe_resolve_dependencies_pipeline_integration_misc( + self, mock_resolve_dependencies + ): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + input_location = self.data / "manifests" / "requirements.txt" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(1, project1.discoveredpackages.count()) + + @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") + def test_scanpipe_resolve_dependencies_pipeline_pypi_integration( + self, mock_resolve_dependencies + ): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) + mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + discoveredpackage = project1.discoveredpackages.get() + exclude_fields = ["qualifiers", "release_date", "size"] + for field_name, value in package_data1.items(): + if value and field_name not in exclude_fields: + self.assertEqual(value, getattr(discoveredpackage, field_name)) + + def test_scanpipe_load_sbom_pipeline_aboutfile_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "manifests" / "Django-4.0.8-py3-none-any.whl.ABOUT" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + discoveredpackage = project1.discoveredpackages.get() + self.assertEqual("pypi", discoveredpackage.type) + self.assertEqual("django", discoveredpackage.name) + self.assertEqual("4.0.8", discoveredpackage.version) + self.assertEqual("bsd-new", discoveredpackage.declared_license_expression) + + def test_scanpipe_load_sbom_pipeline_spdx_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "manifests" / "toml.spdx.json" + project1.copy_input_from(input_location) + + run = 
project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + discoveredpackage = project1.discoveredpackages.get() + self.assertEqual("pypi", discoveredpackage.type) + self.assertEqual("toml", discoveredpackage.name) + self.assertEqual("0.10.2", discoveredpackage.version) + self.assertEqual("https://github.com/uiri/toml", discoveredpackage.homepage_url) + self.assertEqual("MIT", discoveredpackage.extracted_license_statement) + self.assertEqual("mit", discoveredpackage.declared_license_expression) + + def test_scanpipe_load_sbom_pipeline_cyclonedx_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "cyclonedx" / "nested.cdx.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(3, project1.discoveredpackages.count()) + packages = project1.discoveredpackages.all() + expected_data = { + "pkg:pypi/toml@0.10.2?extension=tar.gz": { + "type": "pypi", + "name": "toml", + "version": "0.10.2", + "extracted_license_statement": "OFL-1.1\nApache-2.0", + "declared_license_expression": "ofl-1.1 OR apache-2.0", + "homepage_url": "https://cyclonedx.org/website", + "bug_tracking_url": "https://cyclonedx.org/issue-tracker", + "vcs_url": "https://cyclonedx.org/vcs", + "filename": "", + }, + "pkg:pypi/billiard@3.6.3.0": { + "type": "pypi", + "name": "billiard", + "version": "3.6.3.0", + "extracted_license_statement": "BSD-3-Clause", + "declared_license_expression": "bsd-new", + "homepage_url": "", + "bug_tracking_url": "", + "vcs_url": "", + "extra_data": "", + "filename": "", + }, + "pkg:pypi/fictional@9.10.2": { + "type": "pypi", + "name": "fictional", + "version": "9.10.2", + "extracted_license_statement": ( + "LGPL-3.0-or-later" + " AND " + "LicenseRef-scancode-openssl-exception-lgpl3.0plus" + ), + "declared_license_expression": ( + "lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus" + ), + "homepage_url": "https://home.page", + "bug_tracking_url": "", + "vcs_url": "", + "extra_data": "", + "filename": "package.zip", + }, + } + + for package in packages: + expected = expected_data.get(str(package)) + self.assertEqual(expected["type"], package.type) + self.assertEqual(expected["name"], package.name) + self.assertEqual(expected["version"], package.version) + self.assertEqual(expected["homepage_url"], package.homepage_url) + self.assertEqual( + expected["extracted_license_statement"], + package.extracted_license_statement, + ) + self.assertEqual( + expected["declared_license_expression"], + package.declared_license_expression, + ) + self.assertEqual(expected["filename"], package.filename) + + def test_scanpipe_load_sbom_pipeline_cyclonedx_with_dependencies_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "cyclonedx" / "laravel-7.12.0" / "bom.1.4.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(62, project1.discoveredpackages.count()) + self.assertEqual(112, project1.discovereddependencies.count()) + dependency = project1.discovereddependencies.all()[0] + 
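+ # Each dependency loaded from the CycloneDX SBOM keeps a reference to its datafile resource.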
self.assertEqual("bom.1.4.json", str(dependency.datafile_resource)) + + def test_scanpipe_load_sbom_pipeline_cyclonedx_with_vulnerabilities(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = ( + self.data / "cyclonedx" / "python-3.13.0-vulnerabilities.cdx.json" + ) + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + package = project1.discoveredpackages.get() + expected = [ + { + "vulnerability_id": "CVE-2005-2541", + "summary": "Tar 1.15.1 does not properly warn the user when...", + } + ] + self.assertEqual(expected, package.affected_by_vulnerabilities) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("uuid.uuid4") + def test_scanpipe_deploy_to_develop_pipeline_integration( + self, mock_uuid4, mock_request + ): + forced_uuid = "b74fe5df-e965-415e-ba65-f38421a0695d" + mock_uuid4.return_value = forced_uuid + mock_request.return_value = None + pipeline_name = "map_deploy_to_develop" + project1 = make_project(name="Analysis", uuid=forced_uuid) + selected_groups = ["Java"] + + jar_location = self.data / "d2d" / "jars" + project1.copy_input_from(jar_location / "from-flume-ng-node-1.9.0.zip") + project1.copy_input_from(jar_location / "to-flume-ng-node-1.9.0.zip") + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(57, project1.codebaseresources.count()) + self.assertEqual(18, project1.codebaserelations.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "d2d" / "flume-ng-node-d2d.json" + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_deploy_to_develop_pipeline_integration_elfs(self): + pipeline_name = "map_deploy_to_develop" + project1 = make_project(name="Analysis") + selected_groups = ["Elf"] + + elf_location = self.data / "d2d-elfs" + project1.copy_input_from(elf_location / "from-brotli-d2d.zip") + project1.copy_input_from(elf_location / "to-brotli-d2d.zip") + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(17, project1.codebaseresources.count()) + self.assertEqual(7, project1.codebaserelations.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "d2d-elfs" / "brotli-elf-d2d.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_deploy_to_develop_pipeline_extract_input_files_errors(self): + project1 = make_project() + run = project1.add_pipeline("map_deploy_to_develop") + pipeline_instance = deploy_to_develop.DeployToDevelop(run) + + # Create 2 files in the input/ directory to generate error twice + project1.move_input_from(tempfile.mkstemp(prefix="from-")[1]) + project1.move_input_from(tempfile.mkstemp(prefix="to-")[1]) + self.assertEqual(2, len(project1.input_files)) + + pipeline_instance.get_inputs() + with 
mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: + extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} + inputs_with_codebase_path_destination = [ + (pipeline_instance.from_files, project1.codebase_path / d2d.FROM), + (pipeline_instance.to_files, project1.codebase_path / d2d.TO), + ] + + for input_files, codebase_path in inputs_with_codebase_path_destination: + for input_file_path in input_files: + pipeline_instance.extract_archive(input_file_path, codebase_path) + + projects_errors = project1.projectmessages.all() + self.assertEqual(2, len(projects_errors)) + project_error = projects_errors[0] + self.assertEqual("error", project_error.severity) + self.assertEqual("error1\nerror2", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "resource"}, project_error.details) + self.assertEqual("", project_error.traceback) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("uuid.uuid4") + def test_scanpipe_deploy_to_develop_pipeline_with_about_file( + self, mock_uuid4, mock_request + ): + forced_uuid = "90cb6382-431c-4187-be76-d4f1a2199a2f" + mock_uuid4.return_value = forced_uuid + mock_request.return_value = None + pipeline_name = "map_deploy_to_develop" + project1 = make_project(name="Analysis", uuid=forced_uuid) + selected_groups = ["Java"] + + data_dir = self.data / "d2d" / "about_files" + project1.copy_input_from(data_dir / "from-with-about-file.zip") + project1.copy_input_from(data_dir / "to-with-jar.zip") + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(44, project1.codebaseresources.count()) + self.assertEqual(31, project1.codebaserelations.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = data_dir / "expected.json" + self.assertPipelineResultEqual(expected_file, result_file) + + self.assertEqual(1, project1.projectmessages.count()) + message = project1.projectmessages.get() + self.assertEqual("map_about_files", message.model) + expected = ( + "Resource paths listed at about_resource is not found in the to/ codebase" + ) + self.assertIn(expected, message.description) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("scanpipe.pipes.purldb.is_available") + def test_scanpipe_populate_purldb_pipeline_integration( + self, mock_is_available, mock_request_post + ): + pipeline_name1 = "load_inventory" + pipeline_name2 = "populate_purldb" + project1 = make_project() + + input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name1) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + def mock_request_post_return(url, data, headers, timeout): + payload = json.loads(data) + return { + "queued_packages_count": len(payload["packages"]), + "queued_packages": payload["packages"], + "unqueued_packages_count": 1, + "unqueued_packages": [], + "unsupported_packages_count": 1, + "unsupported_packages": [], + } + + mock_request_post.side_effect = mock_request_post_return + mock_is_available.return_value = True + + run = project1.add_pipeline(pipeline_name2) + 
pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertIn("Populating PurlDB with 2 PURLs from DiscoveredPackage", run.log) + self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log) + self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) + self.assertIn("Couldn't index 1 unsupported PURLs", run.log) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("scanpipe.pipes.purldb.is_available") + def test_scanpipe_populate_purldb_pipeline_integration_without_assembly( + self, mock_is_available, mock_request_post + ): + pipeline_name = "populate_purldb" + project1 = make_project() + + def mock_request_post_return(url, data, headers, timeout): + payload = json.loads(data) + return { + "queued_packages_count": len(payload["packages"]), + "queued_packages": payload["packages"], + "unqueued_packages_count": 1, + "unqueued_packages": [], + "unsupported_packages_count": 1, + "unsupported_packages": [], + } + + mock_request_post.side_effect = mock_request_post_return + mock_is_available.return_value = True + + package_json_location = self.data / "manifests" / "package.json" + copy_input(package_json_location, project1.codebase_path) + pipes.collect_and_create_codebase_resources(project1) + + scancode.scan_for_application_packages(project1, assemble=False) + scancode.process_package_data(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertIn("Populating PurlDB with 1 PURLs from DiscoveredPackage", run.log) + self.assertIn( + "Populating PurlDB with 6 unresolved PURLs from DiscoveredDependency", + run.log, + ) + self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) + self.assertIn("Couldn't index 1 unsupported PURLs", run.log) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_collect_symbols_ctags_pipeline_integration(self): + pipeline_name = "collect_symbols_ctags" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "d2d-javascript" / "from" / "main.js" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data_symbols = main_file.extra_data.get("source_symbols") + expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] + self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) + + @skipIf(sys.platform != "linux", "Only supported on Linux") + def test_scanpipe_collect_strings_gettext_pipeline_integration(self): + pipeline_name = "collect_strings_gettext" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "d2d-javascript" / "from" / "main.js" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data_strings = 
main_file.extra_data.get("source_strings") + expected_extra_data_strings = [ + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=", # noqa + "Enter the desired length of your password:", + ] + self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_collect_symbols_pygments_pipeline_integration(self): + pipeline_name = "collect_symbols_pygments" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "source-inspector" / "test3.cpp" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data = main_file.extra_data + + expected_extra_data = ( + self.data / "source-inspector" / "test3.cpp-pygments-expected.json" + ) + + with open(expected_extra_data) as f: + expected_extra_data = json.load(f) + + self.assertDictEqual(expected_extra_data, result_extra_data) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_collect_symbols_tree_sitter_pipeline_integration(self): + pipeline_name = "collect_symbols_tree_sitter" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "source-inspector" / "test3.cpp" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data = main_file.extra_data + + expected_extra_data = ( + self.data / "source-inspector" / "test3.cpp-tree-sitter-expected.json" + ) + + with open(expected_extra_data) as f: + expected_extra_data = json.load(f) + + self.assertDictEqual(expected_extra_data, result_extra_data) + + @mock.patch("scanpipe.pipes.purldb.is_available") + @mock.patch("scanpipe.pipes.purldb.is_configured") + @mock.patch("scanpipe.pipes.purldb.collect_data_for_purl") + def test_scanpipe_enrich_with_purldb_pipeline_integration( + self, mock_collect_data, mock_is_configured, mock_is_available + ): + pipeline_name = "enrich_with_purldb" + project1 = make_project() + package1 = make_package(project1, package_url="pkg:npm/csvtojson@2.0.10") + + mock_is_configured.return_value = True + mock_is_available.return_value = True + + purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json" + purldb_entry = json.loads(purldb_entry_file.read_text()) + mock_collect_data.return_value = [purldb_entry] + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + package1.refresh_from_db() + self.assertTrue(package1.extra_data.get("enrich_with_purldb")) + + run.refresh_from_db() + self.assertIn("pkg:npm/csvtojson@2.0.10 ['release_date'", run.log) + self.assertIn("1 discovered package enriched with the PurlDB.", run.log) From fa1d219933d7514e4b30e63b2437a65004e21eb8 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 08:34:40 +0530 Subject: [PATCH 06/18] Update Dockerfile From 
cb2d0c6f8c667250f95520d8a90635b1ed80f4b1 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 08:48:32 +0530 Subject: [PATCH 07/18] Update test_pipelines.py --- scanpipe/tests/test_pipelines.py | 2028 ------------------------------ 1 file changed, 2028 deletions(-) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 6cf0262e98..0831e22081 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -2053,2033 +2053,5 @@ def test_scanpipe_enrich_with_purldb_pipeline_integration( run.refresh_from_db() self.assertIn("pkg:npm/csvtojson@2.0.10 ['release_date'", run.log) self.assertIn("1 discovered package enriched with the PurlDB.", run.log) -======= -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. 
- -import io -import json -import os -import sys -import tempfile -from contextlib import redirect_stderr -from pathlib import Path -from unittest import mock -from unittest import skipIf - -from django.conf import settings -from django.test import TestCase -from django.test import tag - -from packageurl import PackageURL -from scancode.cli_test_utils import purl_with_fake_uuid -from scorecode.models import PackageScore - -from scanpipe import pipes -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredPackage -from scanpipe.pipelines import CommonStepsMixin -from scanpipe.pipelines import InputFilesError -from scanpipe.pipelines import Pipeline -from scanpipe.pipelines import analyze_root_filesystem -from scanpipe.pipelines import deploy_to_develop -from scanpipe.pipelines import is_pipeline -from scanpipe.pipelines import scan_single_package -from scanpipe.pipes import d2d -from scanpipe.pipes import flag -from scanpipe.pipes import output -from scanpipe.pipes import scancode -from scanpipe.pipes.input import copy_input -from scanpipe.tests import FIXTURES_REGEN -from scanpipe.tests import make_mock_response -from scanpipe.tests import make_package -from scanpipe.tests import make_project -from scanpipe.tests import package_data1 -from scanpipe.tests.pipelines.do_nothing import DoNothing -from scanpipe.tests.pipelines.download_inputs import DownloadInput -from scanpipe.tests.pipelines.profile_step import ProfileStep -from scanpipe.tests.pipelines.steps_as_attribute import StepsAsAttribute -from scanpipe.tests.pipelines.with_groups import WithGroups - -from_docker_image = os.environ.get("FROM_DOCKER_IMAGE") - - -class ScanPipePipelinesTest(TestCase): - data = Path(__file__).parent / "data" - - def test_scanpipe_pipeline_class_pipeline_name_attribute(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline_instance = DoNothing(run) - self.assertEqual("do_nothing", pipeline_instance.pipeline_name) - - def test_scanpipe_pipeline_class_get_info(self): - expected = { - "description": "Description section of the doc string.", - "summary": "Do nothing, in 2 steps.", - "steps": [ - {"name": "step1", "doc": "Step1 doc.", "groups": []}, - {"name": "step2", "doc": "Step2 doc.", "groups": []}, - ], - "available_groups": [], - } - self.assertEqual(expected, DoNothing.get_info()) - - expected = { - "summary": "Profile a step using the @profile decorator.", - "description": "", - "steps": [ - {"name": "step", "doc": "", "groups": []}, - ], - "available_groups": [], - } - self.assertEqual(expected, ProfileStep.get_info()) - - def test_scanpipe_pipeline_class_get_summary(self): - expected = "Do nothing, in 2 steps." - self.assertEqual(expected, DoNothing.get_summary()) - - expected = "Profile a step using the @profile decorator." 
- self.assertEqual(expected, ProfileStep.get_summary()) - - def test_scanpipe_pipeline_class_log(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - pipeline.log("Event1") - pipeline.log("Event2") - - run.refresh_from_db() - self.assertIn("Event1", run.log) - self.assertIn("Event2", run.log) - - def test_scanpipe_pipeline_class_execute(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode) - self.assertEqual("", out) - - run.refresh_from_db() - self.assertIn("Pipeline [do_nothing] starting", run.log) - self.assertIn("Step [step1] starting", run.log) - self.assertIn("Step [step1] completed", run.log) - self.assertIn("Step [step2] starting", run.log) - self.assertIn("Step [step2] completed", run.log) - self.assertIn("Pipeline completed", run.log) - - def test_scanpipe_pipeline_class_execute_with_exception(self): - project1 = make_project() - run = project1.add_pipeline("raise_exception") - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode) - self.assertTrue(out.startswith("Error message")) - self.assertIn("Traceback:", out) - self.assertIn("in execute", out) - self.assertIn("step(self)", out) - self.assertIn("in raise_exception", out) - self.assertIn("raise ValueError", out) - - run.refresh_from_db() - self.assertIn("Pipeline [raise_exception] starting", run.log) - self.assertIn("Step [raise_exception_step] starting", run.log) - self.assertIn("Pipeline failed", run.log) - - @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step1") - @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step2") - def test_scanpipe_pipeline_class_execute_with_selected_steps(self, step2, step1): - step1.__name__ = "step1" - step1.groups = [] - step2.__name__ = "step2" - step2.groups = [] - - project1 = make_project() - run = project1.add_pipeline("do_nothing") - run.update(selected_steps=["step2", "not_existing_step"]) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode) - self.assertEqual("", out) - - step1.assert_not_called() - step2.assert_called() - - run.refresh_from_db() - self.assertIn("Pipeline [do_nothing] starting", run.log) - self.assertIn("Step [step1] skipped", run.log) - self.assertIn("Step [step2] starting", run.log) - self.assertIn("Step [step2] completed", run.log) - self.assertIn("Pipeline completed", run.log) - - def test_scanpipe_pipeline_class_download_inputs_attribute(self): - project1 = make_project() - run = project1.add_pipeline("download_inputs") - pipeline = run.make_pipeline_instance() - self.assertTrue(pipeline.download_inputs) - expected = (CommonStepsMixin.download_missing_inputs,) - self.assertEqual(expected, pipeline.get_initial_steps()) - expected = (CommonStepsMixin.download_missing_inputs, DownloadInput.step1) - self.assertEqual(expected, pipeline.get_steps()) - pipeline.execute() - self.assertIn("Step [download_missing_inputs]", run.log) - - run = project1.add_pipeline("profile_step") - pipeline = run.make_pipeline_instance() - self.assertFalse(pipeline.download_inputs) - pipeline.execute() - self.assertNotIn("Step [download_missing_inputs]", run.log) - - @mock.patch("requests.sessions.Session.get") - def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get): - project1 = make_project() - run = 
project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - file_location = self.data / "aboutcode" / "notice.NOTICE" - input_source = project1.add_input_source( - filename=file_location.name, is_uploaded=True - ) - self.assertFalse(input_source.exists()) - with self.assertRaises(InputFilesError) as error: - pipeline.download_missing_inputs() - error_msg = ( - "InputFilesError encountered with the following issues:\n\n" - "Error 1: Uploaded file filename=notice.NOTICE [uploaded] not available." - "\n\nNo traceback available." - ) - self.assertEqual(error_msg, str(error.exception)) - self.assertIn( - "Uploaded file filename=notice.NOTICE [uploaded] not available.", run.log - ) - - project1.copy_input_from(file_location) - self.assertTrue(input_source.exists()) - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - pipeline.download_missing_inputs() - self.assertEqual("", run.log) - - download_url = "https://download.url/file.zip" - mock_get.return_value = make_mock_response(url=download_url) - input_source2 = project1.add_input_source(download_url=download_url) - pipeline.download_missing_inputs() - self.assertIn("Fetching input from https://download.url/file.zip", run.log) - input_source2.refresh_from_db() - self.assertEqual("file.zip", input_source2.filename) - self.assertTrue(input_source2.exists()) - mock_get.assert_called_once() - - @mock.patch("scanpipe.models.InputSource.fetch") - def test_scanpipe_pipeline_class_download_fetch_exception(self, mock_fetch): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - mock_fetch.side_effect = Exception("File not found") - download_url = "https://download.url/file.zip" - project1.add_input_source(download_url=download_url) - - with self.assertRaises(InputFilesError) as error: - pipeline.download_missing_inputs() - self.assertIn( - "InputFilesError encountered with the following issues:", - str(error.exception), - ) - self.assertIn("Error 1: File not found", str(error.exception)) - self.assertIn("Traceback (most recent call last):", str(error.exception)) - self.assertIn("Exception: File not found", str(error.exception)) - - self.assertIn("Fetching input from https://download.url/file.zip", run.log) - self.assertIn("https://download.url/file.zip could not be fetched.", run.log) - - @mock.patch("git.repo.base.Repo.clone_from") - def test_scanpipe_pipeline_class_download_missing_inputs_git_repo(self, mock_clone): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - download_url = "https://github.com/aboutcode-org/scancode.io.git" - input_source = project1.add_input_source(download_url=download_url) - - def mock_make_to_path(**kwargs): - to_path = kwargs.get("to_path") - to_path.mkdir() - - mock_clone.side_effect = mock_make_to_path - mock_clone.return_value = None - - pipeline.download_missing_inputs() - self.assertIn( - "Fetching input from https://github.com/aboutcode-org/scancode.io.git", - run.log, - ) - input_source.refresh_from_db() - self.assertEqual("scancode.io.git", input_source.filename) - self.assertTrue(input_source.exists()) - - def test_scanpipe_pipeline_class_save_errors_context_manager(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - self.assertEqual(project1, pipeline.project) - - with pipeline.save_errors(Exception): - raise Exception("Error message") - - 
message = project1.projectmessages.get() - self.assertEqual("do_nothing", message.model) - self.assertEqual({}, message.details) - self.assertEqual("Error message", message.description) - self.assertIn('raise Exception("Error message")', message.traceback) - - resource1 = CodebaseResource.objects.create(project=project1, path="filename") - with pipeline.save_errors(Exception, resource=resource1): - raise Exception("Error message") - message = project1.projectmessages.latest("created_date") - self.assertEqual({"resource_path": str(resource1.path)}, message.details) - - def test_scanpipe_pipelines_is_pipeline(self): - self.assertFalse(is_pipeline(None)) - self.assertFalse(is_pipeline(Pipeline)) - self.assertTrue(is_pipeline(DoNothing)) - - class SubSubClass(DoNothing): - pass - - self.assertTrue(is_pipeline(SubSubClass)) - - def test_scanpipe_pipeline_class_get_graph(self): - expected = [ - {"name": "step1", "doc": "Step1 doc.", "groups": []}, - {"name": "step2", "doc": "Step2 doc.", "groups": []}, - ] - self.assertEqual(expected, DoNothing.get_graph()) - - def test_scanpipe_pipelines_profile_decorator(self): - project1 = make_project() - run = project1.add_pipeline("profile_step") - pipeline_instance = run.make_pipeline_instance() - - exitcode, out = pipeline_instance.execute() - self.assertEqual(0, exitcode) - - run.refresh_from_db() - self.assertIn("Profiling results at", run.log) - self.assertIn("Pipeline completed", run.log) - - self.assertEqual(1, len(project1.output_root)) - output_file = project1.output_root[0] - self.assertTrue(output_file.startswith("profile-")) - self.assertTrue(output_file.endswith(".html")) - - def test_scanpipe_pipeline_class_get_steps(self): - expected = ( - DoNothing.step1, - DoNothing.step2, - ) - self.assertEqual(expected, DoNothing.get_steps()) - - with self.assertRaises(TypeError) as cm: - StepsAsAttribute.get_steps() - expected = "Use a ``steps(cls)`` classmethod to declare the steps." 
- self.assertEqual(expected, str(cm.exception)) - - def test_scanpipe_pipeline_class_get_steps_with_groups(self): - expected = (WithGroups.no_groups,) - self.assertEqual(expected, WithGroups.get_steps()) - self.assertEqual(expected, WithGroups.get_steps(groups=[])) - self.assertEqual(expected, WithGroups.get_steps(groups=["not_defined"])) - - expected = ( - WithGroups.grouped_with_foo_and_bar, - WithGroups.grouped_with_bar, - WithGroups.no_groups, - ) - self.assertEqual(expected, WithGroups.get_steps(groups=["bar"])) - self.assertEqual(expected, WithGroups.get_steps(groups=["foo", "bar"])) - - expected = ( - WithGroups.grouped_with_foo_and_bar, - WithGroups.no_groups, - ) - self.assertEqual(expected, WithGroups.get_steps(groups=["foo"])) - - def test_scanpipe_pipeline_class_get_available_groups(self): - self.assertEqual(["bar", "excluded", "foo"], WithGroups.get_available_groups()) - self.assertEqual([], DoNothing.get_available_groups()) - - def test_scanpipe_pipeline_class_env_loaded_from_config_file(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - config_file = project1.input_path / settings.SCANCODEIO_CONFIG_FILE - config_file.write_text("{*this is not valid yml*}") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - config_file.write_text("product_name: Product") - pipeline = run.make_pipeline_instance() - self.assertEqual({"product_name": "Product"}, pipeline.env) - - def test_scanpipe_pipeline_class_env_reloaded_after_extraction(self): - project1 = make_project() - - input_location = self.data / "settings" / "archived-scancode-config.zip" - project1.copy_input_from(input_location) - run = project1.add_pipeline("scan_codebase") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - # Manually run steps, env is reload from the scancode-config.yml contained in - # the archive - pipeline.copy_inputs_to_codebase_directory() - pipeline.extract_archives() - - expected = { - "product_name": "My Product Name", - "product_version": "1.0", - "ignored_patterns": ["*.tmp", "tests/*"], - } - self.assertEqual(expected, pipeline.env) - - def test_scanpipe_pipeline_class_flag_ignored_resources(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - self.assertIsNone(pipeline.env.get("ignored_patterns")) - - project1.settings.update({"ignored_patterns": "*.ext"}) - project1.save() - pipeline = run.make_pipeline_instance() - - with mock.patch("scanpipe.pipes.flag.flag_ignored_patterns") as mock_flag: - mock_flag.return_value = None - pipeline.flag_ignored_resources() - - mock_flag.assert_called_once() - patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS] - self.assertEqual(mock_flag.mock_calls[0].kwargs["patterns"], patterns_args) - self.assertEqual(mock_flag.mock_calls[0].kwargs["codebaseresources"].count(), 0) - - def test_scanpipe_pipeline_class_extract_archive(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - target = tempfile.mkdtemp() - input_location = str(self.data / "scancode" / "corrupted.tar.gz") - pipeline.extract_archive(input_location, target) - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors.get() - self.assertEqual("error", project_error.severity) - self.assertIn("gzip decompression failed", 
project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "corrupted.tar.gz"}, project_error.details) - self.assertEqual("", project_error.traceback) - - def test_scanpipe_pipeline_class_extract_archives(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - input_location = str(self.data / "scancode" / "corrupted.tar.gz") - resource_location = copy_input(input_location, project1.codebase_path) - pipeline.extract_archives() - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors.get() - self.assertEqual("error", project_error.severity) - self.assertIn("gzip decompression failed", project_error.description) - self.assertEqual("extract_archives", project_error.model) - self.assertEqual( - {"resource_path": str(resource_location)}, project_error.details - ) - self.assertEqual("", project_error.traceback) - - -class RootFSPipelineTest(TestCase): - def test_scanpipe_rootfs_pipeline_extract_input_files_errors(self): - project1 = make_project() - run = project1.add_pipeline("analyze_root_filesystem_or_vm_image") - pipeline_instance = analyze_root_filesystem.RootFS(run) - - # Create 2 files in the input/ directory to generate error twice - project1.move_input_from(tempfile.mkstemp()[1]) - project1.move_input_from(tempfile.mkstemp()[1]) - self.assertEqual(2, len(project1.input_files)) - - with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - pipeline_instance.extract_input_files_to_codebase_directory() - - projects_errors = project1.projectmessages.all() - self.assertEqual(2, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - -def sort_for_os_compatibility(scan_data): - """Sort the ``scan_data`` files and relations in place. 
Return ``scan_data``.""" - if files := scan_data.get("files"): - files.sort(key=lambda x: x["path"]) - - if relations := scan_data.get("relations"): - relations.sort(key=lambda x: x["to_resource"]) - - return scan_data - - -@tag("slow") -class PipelinesIntegrationTest(TestCase): - """Integration tests to ensure the proper output for each built-in Pipelines.""" - - # Un-comment the following to display full diffs: - # maxDiff = None - data = Path(__file__).parent / "data" - exclude_from_diff = [ - "start_timestamp", - "end_timestamp", - "date", - "duration", - "input", - "compliance_alert", - "policy", - "tool_version", - "other_tools", - "created_date", - "log", - "uuid", - "size", # directory sizes are OS dependant - "size_count", - "--json-pp", - "--processes", - "--verbose", - # system_environment differs between systems - "system_environment", - "file_type", - # mime type and is_script are inconsistent across systems - "mime_type", - "is_script", - "notes", - "settings", - "description", - "traceback", - ] - - def _without_keys(self, data, exclude_keys): - """Return the `data` excluding the provided `exclude_keys`.""" - if isinstance(data, list): - return [self._without_keys(entry, exclude_keys) for entry in data] - - if isinstance(data, dict): - return { - key: ( - self._without_keys(value, exclude_keys) - if type(value) in [list, dict] - else value - ) - for key, value in data.items() - if key not in exclude_keys - } - - return data - - def purl_fields_with_fake_uuid(self, value, key): - purl_fields = ["purl", "for_packages", "package_uid"] - purl_name = "fixed-name-for-testing-5642512d1758" - purl_namespace = "fixed-namespace-for-testing-5642512d1758" - - if key == "name": - return purl_name - elif key == "namespace": - return purl_namespace - elif key in purl_fields: - purl_old = PackageURL.from_string(value) - if purl_old.type != "local-files": - return purl_with_fake_uuid(value) - - purl = PackageURL( - name=purl_name, - namespace=purl_namespace, - type="local-files", - version=purl_old.version, - qualifiers=purl_old.qualifiers, - subpath=purl_old.subpath, - ) - return purl_with_fake_uuid(purl.to_string()) - - def _normalize_package_uids(self, data): - """ - Return the `data`, where any `package_uid` value has been normalized - with `purl_with_fake_uuid()` - """ - fields_with_package_uids = [ - "package_uid", - "dependency_uid", - "for_package_uid", - "resolved_to_package_uid", - ] - if isinstance(data, list): - return [self._normalize_package_uids(entry) for entry in data] - - if isinstance(data, dict): - is_local_files = False - if data.get("type") and data["type"] == "local-files": - is_local_files = True - normalized_data = {} - for key, value in data.items(): - if isinstance(value, list | dict): - value = self._normalize_package_uids(value) - if key in fields_with_package_uids and value: - value = purl_with_fake_uuid(value) - if key == "for_packages" and value: - value = sorted( - [ - self.purl_fields_with_fake_uuid(package_uid, key) - for package_uid in value - ] - ) - if ( - is_local_files - and key in ("name", "namespace", "purl", "package_uid") - and value - ): - value = self.purl_fields_with_fake_uuid(value, key) - normalized_data[key] = value - return normalized_data - - return data - - def _sort_dependencies(self, data): - """ - Sort dependencies by their "for_package_uid". - - After dependency resolution in some cases we have multiple - dependency requirements resolved to a same package, and they - are not sorted the same way every time. 
- """ - mappings = data.get("dependencies") - if mappings: - mappings_by_uid = {} - for mapping in mappings: - uid = mapping.get("for_package_uid") or "" - mappings_by_uid[uid] = mapping - data["dependencies"] = list(dict(sorted(mappings_by_uid.items())).values()) - return data - - def test_package_uids_normalized_in_pipeline_integration_tests(self): - self.maxDiff = 1000 - data = { - "type": "local-files", - "package_uid": ( - "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23" - "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24" - ), - "for_packages": [ - ( - "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23" - "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24" - ) - ], - } - normalized_data = self._normalize_package_uids(data=data) - expected_data = { - "type": "local-files", - "package_uid": ( - "pkg:local-files/fixed-namespace-for-testing-5642512d1758/" - "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758" - ), - "for_packages": [ - ( - "pkg:local-files/fixed-namespace-for-testing-5642512d1758/" - "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758" - ) - ], - } - self.assertEqual(normalized_data, expected_data) - - def assertPipelineResultEqual( - self, expected_file, result_file, sort_dependencies=False, regen=FIXTURES_REGEN - ): - """Set `regen` to True to regenerate the expected results.""" - result_json = json.loads(Path(result_file).read_text()) - result_json = self._normalize_package_uids(result_json) - result_data = self._without_keys(result_json, self.exclude_from_diff) - if sort_dependencies: - result_data = self._sort_dependencies(result_data) - result_data = sort_for_os_compatibility(result_data) - - if regen: - expected_file.write_text(json.dumps(result_data, indent=2)) - - expected_json = json.loads(expected_file.read_text()) - expected_json = self._normalize_package_uids(expected_json) - expected_data = self._without_keys(expected_json, self.exclude_from_diff) - if sort_dependencies: - result_data = self._sort_dependencies(result_data) - expected_data = sort_for_os_compatibility(expected_data) - - self.assertEqual(expected_data, result_data) - - @skipIf(from_docker_image, "Random failure in the Docker context.") - def test_scanpipe_scan_package_pipeline_integration(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "scancode" / "is-npm-1.0.0.tgz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(4, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_package.json" - self.assertPipelineResultEqual(expected_file, scancode_file) - - summary_file = project1.get_latest_output(filename="summary") - expected_file = ( - self.data / "scancode" / "is-npm-1.0.0_scan_package_summary.json" - ) - self.assertPipelineResultEqual(expected_file, summary_file) - - # Ensure that we only have one instance of is-npm in `key_files_packages` - summary_data = json.loads(Path(summary_file).read_text()) - key_files_packages = summary_data.get("key_files_packages", []) - self.assertEqual(1, len(key_files_packages)) - key_file_package = 
key_files_packages[0] - key_file_package_purl = key_file_package.get("purl", "") - self.assertEqual("pkg:npm/is-npm@1.0.0", key_file_package_purl) - - @skipIf(from_docker_image, "Random failure in the Docker context.") - def test_scanpipe_scan_package_pipeline_integration_multiple_packages(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "scancode" / "multiple-is-npm-1.0.0.tar.gz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(9, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(2, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = ( - self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package.json" - ) - # Do not override the regen as this file is generated in regen_test_data - self.assertPipelineResultEqual(expected_file, scancode_file) - - summary_file = project1.get_latest_output(filename="summary") - expected_file = ( - self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package_summary.json" - ) - self.assertPipelineResultEqual(expected_file, summary_file) - - @mock.patch("scanpipe.pipelines.scan_single_package.is_archive") - def test_scanpipe_scan_package_single_extract_input_to_codebase_directory( - self, mock_is_archive - ): - project1 = make_project() - run = project1.add_pipeline("scan_single_package") - pipeline_instance = scan_single_package.ScanSinglePackage(run) - - project1.move_input_from(tempfile.mkstemp(suffix=".zip")[1]) - self.assertEqual(1, len(project1.input_files)) - - mock_is_archive.return_value = True - pipeline_instance.get_package_input() - with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - pipeline_instance.extract_input_to_codebase_directory() - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - def test_scanpipe_scan_package_single_file(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "manifests" / "openpdf-parent-1.3.11.pom.xml" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(10, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = ( - self.data / "manifests" / "openpdf-parent-1.3.11_scan_package.json" - ) - self.assertPipelineResultEqual(expected_file, scancode_file) - - @mock.patch("git.repo.base.Repo.clone_from") - def test_scanpipe_scan_package_single_package_git_repo(self, mock_clone): - pipeline_name = "scan_single_package" - project1 = make_project() - - download_url = 
"https://github.com/aboutcode-org/scancode.io.git" - project1.add_input_source(download_url=download_url) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - # Create the "fetched" git directory content - def mock_make_git_directory(**kwargs): - to_path = kwargs.get("to_path") # scancode.io.git - to_path.mkdir() - file_location = self.data / "aboutcode" / "notice.NOTICE" - copy_input(file_location, to_path) - - mock_clone.side_effect = mock_make_git_directory - mock_clone.return_value = None - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(2, project1.codebaseresources.count()) - self.assertEqual(0, project1.discoveredpackages.count()) - - def test_scanpipe_scan_codebase_pipeline_integration(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_scan_codebase_creates_top_level_paths(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] - - top_level_resources = project1.codebaseresources.filter(parent_path="") - top_level_paths = [resource.path for resource in top_level_resources] - - self.assertListEqual(top_level_paths, expected_top_level_paths) - - def test_scanpipe_scan_codebase_creates_parent_path_field(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] - expected_nested_paths = [ - "is-npm-1.0.0.tgz-extract/package/index.js", - "is-npm-1.0.0.tgz-extract/package/package.json", - "is-npm-1.0.0.tgz-extract/package/readme.md", - ] - - top_level_resources = project1.codebaseresources.filter(parent_path="") - top_level_paths = [resource.path for resource in top_level_resources] - - self.assertListEqual(top_level_paths, expected_top_level_paths) - - nested_resources = project1.codebaseresources.filter( - parent_path="is-npm-1.0.0.tgz-extract/package" - ) - nested_paths = [resource.path for resource in nested_resources] - - self.assertListEqual(nested_paths, expected_nested_paths) - - def test_scanpipe_inspect_packages_creates_packages_npm(self): - pipeline_name = 
"inspect_packages" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - package = project1.discoveredpackages.get() - dependency = project1.discovereddependencies.get() - - self.assertEqual(3, package.codebase_resources.count()) - self.assertEqual("pkg:npm/is-npm@1.0.0", dependency.for_package.purl) - self.assertEqual(package.datasource_ids, [dependency.datasource_id]) - self.assertEqual( - package.codebase_resources.get( - path="is-npm-1.0.0.tgz-extract/package/package.json" - ).path, - dependency.datafile_resource.path, - ) - - def test_scanpipe_inspect_packages_creates_packages_pypi(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "manifests" / "python-inspector-0.10.0.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(0, project1.discoveredpackages.count()) - self.assertEqual(26, project1.discovereddependencies.count()) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_inspect_packages_with_resolved_dependencies_npm(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_npm.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(4, project1.codebaseresources.count()) - self.assertEqual(7, project1.discoveredpackages.count()) - self.assertEqual(6, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data - / "dependencies" - / "resolved_dependencies_npm_inspect_packages.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_inspect_packages_with_resolved_dependencies_poetry(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_poetry.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(5, project1.codebaseresources.count()) - self.assertEqual(6, project1.discoveredpackages.count()) - self.assertEqual(10, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data - / "dependencies" - / "resolved_dependencies_poetry_inspect_packages.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not 
supported on macOS") - def test_scanpipe_resolved_dependencies_cocoapods(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = ( - self.data / "dependencies" / "resolved_dependencies_cocoapods.zip" - ) - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(25, project1.discoveredpackages.count()) - self.assertEqual(30, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "dependencies" / "resolved_dependencies_cocoapods.json" - ) - self.assertPipelineResultEqual( - expected_file, result_file, sort_dependencies=True - ) - - def test_scanpipe_resolved_dependencies_pip_inspect(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_pip.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(4, project1.discoveredpackages.count()) - self.assertEqual(17, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "dependencies" / "resolved_dependencies_pip.json" - self.assertPipelineResultEqual( - expected_file, - result_file, - ) - - def test_scanpipe_resolved_dependencies_nuget(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_nuget.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(34, project1.discoveredpackages.count()) - self.assertEqual(108, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "dependencies" / "resolved_dependencies_nuget.json" - self.assertPipelineResultEqual( - expected_file, - result_file, - sort_dependencies=True, - ) - - def test_scanpipe_scan_codebase_can_process_wheel(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "daglib-0.6.0-py3-none-any.whl" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(11, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(8, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "scancode" / "daglib-0.6.0-py3-none-any.whl_scan_codebase.json" - ) - self.assertPipelineResultEqual(expected_file, 
result_file) - - @skipIf(sys.platform != "linux", "Expected results are inconsistent across OS") - def test_scanpipe_docker_pipeline_alpine_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "alpine_3_15_4.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(510, project1.codebaseresources.count()) - self.assertEqual(14, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "docker" / "alpine_3_15_4_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_does_not_report_errors_for_broken_symlinks(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "minitag.tar" - input_location = self.data / "image-with-symlinks" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - with redirect_stderr(io.StringIO()): - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - project_messages = project1.projectmessages.all() - self.assertEqual(1, len(project_messages)) - self.assertEqual("Distro not found.", project_messages[0].description) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "image-with-symlinks" / (filename + "-expected-scan.json") - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform != "linux", "RPM related features only supported on Linux.") - def test_scanpipe_docker_pipeline_rpm_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "centos.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(29, project1.codebaseresources.count()) - self.assertEqual(101, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "docker" / "centos_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_debian_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "debian.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(16, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = 
self.data / "docker" / "debian_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_distroless_debian_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "gcr_io_distroless_base.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(2458, project1.codebaseresources.count()) - self.assertEqual(6, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "docker" / "gcr_io_distroless_base_scan_codebase.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_rootfs_pipeline_integration(self): - pipeline_name = "analyze_root_filesystem_or_vm_image" - project1 = make_project() - - input_location = self.data / "rootfs" / "basic-rootfs.tar.gz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(17, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "rootfs" / "basic-rootfs_root_filesystems.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_load_inventory_pipeline_integration(self): - pipeline_name = "load_inventory" - project1 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(18, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(4, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "asgiref" / "asgiref-3.3.0_load_inventory_expected.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - # Using the ScanCode.io JSON output as the input - project2 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_scanpipe_output.json" - project2.copy_input_from(input_location) - - run = project2.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(18, project2.codebaseresources.count()) - self.assertEqual(2, project2.discoveredpackages.count()) - self.assertEqual(4, project2.discovereddependencies.count()) - - @mock.patch("scanpipe.pipes.vulnerablecode.is_available") - @mock.patch("scanpipe.pipes.vulnerablecode.is_configured") - @mock.patch("scanpipe.pipes.vulnerablecode.bulk_search_by_purl") - def test_scanpipe_find_vulnerabilities_pipeline_integration( - self, mock_bulk_search_by_purl, mock_is_configured, mock_is_available - ): - pipeline_name = "find_vulnerabilities" - project1 = make_project() - 
package1 = DiscoveredPackage.create_from_data(project1, package_data1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_configured.return_value = False - mock_is_available.return_value = False - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode, msg=out) - self.assertIn("VulnerableCode is not configured.", out) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_configured.return_value = True - mock_is_available.return_value = True - vulnerability_data = [ - { - "purl": "pkg:deb/debian/adduser@3.118?arch=all", - "affected_by_vulnerabilities": [ - { - "vulnerability_id": "VCID-cah8-awtr-aaad", - "summary": "An issue was discovered.", - }, - ], - }, - { - "purl": "pkg:deb/debian/adduser@3.118?qualifiers=1", - "affected_by_vulnerabilities": [ - { - "vulnerability_id": "VCID-cah8-awtr-aaad", - "summary": "An issue was discovered.", - }, - ], - }, - ] - mock_bulk_search_by_purl.return_value = vulnerability_data - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - expected = vulnerability_data[0]["affected_by_vulnerabilities"] - self.assertEqual(expected, package1.affected_by_vulnerabilities) - - @mock.patch("scorecode.ossf_scorecard.is_available") - def test_scanpipe_fetch_scores_pipeline_integration(self, mock_is_available): - pipeline_name = "fetch_scores" - project1 = make_project() - package1 = DiscoveredPackage.create_from_data(project1, package_data1) - package1.vcs_url = "https://github.com/ossf/scorecard" - package1.save() - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_available.return_value = False - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode, msg=out) - self.assertIn("ScoreCode service is not available.", out) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_available.return_value = True - - package_score_data = { - "scoring_tool": "ossf_scorecard", - "scoring_tool_version": "v5.2.1", - "score": "9.7", - "scoring_tool_documentation_url": "https://github.com/[trunc...]", - "score_date": "2025-07-24T18:50:16Z", - } - with mock.patch("scorecode.ossf_scorecard.fetch_scorecard_info") as fetch: - fetch.return_value = PackageScore(**package_score_data) - exitcode, out = pipeline.execute() - - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - scorecard_entry = package1.scores.filter(scoring_tool="ossf-scorecard").first() - self.assertIsNotNone(scorecard_entry) - self.assertEqual("ossf-scorecard", scorecard_entry.scoring_tool) - self.assertEqual("v5.2.1", scorecard_entry.scoring_tool_version) - self.assertTrue(scorecard_entry.score) - - def test_scanpipe_resolve_dependencies_pipeline_integration(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp()[1]) - pipeline.execute() - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("get_packages_from_manifest", message.model) - expected = "No resources containing package data found in codebase." 
- self.assertIn(expected, message.description) - - def test_scanpipe_resolve_dependencies_pipeline_integration_empty_manifest(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) - pipeline.execute() - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("get_packages_from_manifest", message.model) - expected = "No packages could be resolved" - self.assertIn(expected, message.description) - - @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") - def test_scanpipe_resolve_dependencies_pipeline_integration_misc( - self, mock_resolve_dependencies - ): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - input_location = self.data / "manifests" / "requirements.txt" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(1, project1.discoveredpackages.count()) - - @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") - def test_scanpipe_resolve_dependencies_pipeline_pypi_integration( - self, mock_resolve_dependencies - ): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) - mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - exclude_fields = ["qualifiers", "release_date", "size"] - for field_name, value in package_data1.items(): - if value and field_name not in exclude_fields: - self.assertEqual(value, getattr(discoveredpackage, field_name)) - - def test_scanpipe_load_sbom_pipeline_aboutfile_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "manifests" / "Django-4.0.8-py3-none-any.whl.ABOUT" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - self.assertEqual("pypi", discoveredpackage.type) - self.assertEqual("django", discoveredpackage.name) - self.assertEqual("4.0.8", discoveredpackage.version) - self.assertEqual("bsd-new", discoveredpackage.declared_license_expression) - - def test_scanpipe_load_sbom_pipeline_spdx_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "manifests" / "toml.spdx.json" - project1.copy_input_from(input_location) - - run = 
project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - self.assertEqual("pypi", discoveredpackage.type) - self.assertEqual("toml", discoveredpackage.name) - self.assertEqual("0.10.2", discoveredpackage.version) - self.assertEqual("https://github.com/uiri/toml", discoveredpackage.homepage_url) - self.assertEqual("MIT", discoveredpackage.extracted_license_statement) - self.assertEqual("mit", discoveredpackage.declared_license_expression) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "cyclonedx" / "nested.cdx.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(3, project1.discoveredpackages.count()) - packages = project1.discoveredpackages.all() - expected_data = { - "pkg:pypi/toml@0.10.2?extension=tar.gz": { - "type": "pypi", - "name": "toml", - "version": "0.10.2", - "extracted_license_statement": "OFL-1.1\nApache-2.0", - "declared_license_expression": "ofl-1.1 OR apache-2.0", - "homepage_url": "https://cyclonedx.org/website", - "bug_tracking_url": "https://cyclonedx.org/issue-tracker", - "vcs_url": "https://cyclonedx.org/vcs", - "filename": "", - }, - "pkg:pypi/billiard@3.6.3.0": { - "type": "pypi", - "name": "billiard", - "version": "3.6.3.0", - "extracted_license_statement": "BSD-3-Clause", - "declared_license_expression": "bsd-new", - "homepage_url": "", - "bug_tracking_url": "", - "vcs_url": "", - "extra_data": "", - "filename": "", - }, - "pkg:pypi/fictional@9.10.2": { - "type": "pypi", - "name": "fictional", - "version": "9.10.2", - "extracted_license_statement": ( - "LGPL-3.0-or-later" - " AND " - "LicenseRef-scancode-openssl-exception-lgpl3.0plus" - ), - "declared_license_expression": ( - "lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus" - ), - "homepage_url": "https://home.page", - "bug_tracking_url": "", - "vcs_url": "", - "extra_data": "", - "filename": "package.zip", - }, - } - - for package in packages: - expected = expected_data.get(str(package)) - self.assertEqual(expected["type"], package.type) - self.assertEqual(expected["name"], package.name) - self.assertEqual(expected["version"], package.version) - self.assertEqual(expected["homepage_url"], package.homepage_url) - self.assertEqual( - expected["extracted_license_statement"], - package.extracted_license_statement, - ) - self.assertEqual( - expected["declared_license_expression"], - package.declared_license_expression, - ) - self.assertEqual(expected["filename"], package.filename) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_with_dependencies_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "cyclonedx" / "laravel-7.12.0" / "bom.1.4.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(62, project1.discoveredpackages.count()) - self.assertEqual(112, project1.discovereddependencies.count()) - dependency = project1.discovereddependencies.all()[0] - 
self.assertEqual("bom.1.4.json", str(dependency.datafile_resource)) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_with_vulnerabilities(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = ( - self.data / "cyclonedx" / "python-3.13.0-vulnerabilities.cdx.json" - ) - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - package = project1.discoveredpackages.get() - expected = [ - { - "vulnerability_id": "CVE-2005-2541", - "summary": "Tar 1.15.1 does not properly warn the user when...", - } - ] - self.assertEqual(expected, package.affected_by_vulnerabilities) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("uuid.uuid4") - def test_scanpipe_deploy_to_develop_pipeline_integration( - self, mock_uuid4, mock_request - ): - forced_uuid = "b74fe5df-e965-415e-ba65-f38421a0695d" - mock_uuid4.return_value = forced_uuid - mock_request.return_value = None - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis", uuid=forced_uuid) - selected_groups = ["Java"] - - jar_location = self.data / "d2d" / "jars" - project1.copy_input_from(jar_location / "from-flume-ng-node-1.9.0.zip") - project1.copy_input_from(jar_location / "to-flume-ng-node-1.9.0.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(57, project1.codebaseresources.count()) - self.assertEqual(18, project1.codebaserelations.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "d2d" / "flume-ng-node-d2d.json" - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_deploy_to_develop_pipeline_integration_elfs(self): - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis") - selected_groups = ["Elf"] - - elf_location = self.data / "d2d-elfs" - project1.copy_input_from(elf_location / "from-brotli-d2d.zip") - project1.copy_input_from(elf_location / "to-brotli-d2d.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(17, project1.codebaseresources.count()) - self.assertEqual(7, project1.codebaserelations.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "d2d-elfs" / "brotli-elf-d2d.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_deploy_to_develop_pipeline_extract_input_files_errors(self): - project1 = make_project() - run = project1.add_pipeline("map_deploy_to_develop") - pipeline_instance = deploy_to_develop.DeployToDevelop(run) - - # Create 2 files in the input/ directory to generate error twice - project1.move_input_from(tempfile.mkstemp(prefix="from-")[1]) - project1.move_input_from(tempfile.mkstemp(prefix="to-")[1]) - self.assertEqual(2, len(project1.input_files)) - - pipeline_instance.get_inputs() - with 
mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - inputs_with_codebase_path_destination = [ - (pipeline_instance.from_files, project1.codebase_path / d2d.FROM), - (pipeline_instance.to_files, project1.codebase_path / d2d.TO), - ] - - for input_files, codebase_path in inputs_with_codebase_path_destination: - for input_file_path in input_files: - pipeline_instance.extract_archive(input_file_path, codebase_path) - - projects_errors = project1.projectmessages.all() - self.assertEqual(2, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("uuid.uuid4") - def test_scanpipe_deploy_to_develop_pipeline_with_about_file( - self, mock_uuid4, mock_request - ): - forced_uuid = "90cb6382-431c-4187-be76-d4f1a2199a2f" - mock_uuid4.return_value = forced_uuid - mock_request.return_value = None - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis", uuid=forced_uuid) - selected_groups = ["Java"] - - data_dir = self.data / "d2d" / "about_files" - project1.copy_input_from(data_dir / "from-with-about-file.zip") - project1.copy_input_from(data_dir / "to-with-jar.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(44, project1.codebaseresources.count()) - self.assertEqual(31, project1.codebaserelations.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = data_dir / "expected.json" - self.assertPipelineResultEqual(expected_file, result_file) - - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("map_about_files", message.model) - expected = ( - "Resource paths listed at about_resource is not found in the to/ codebase" - ) - self.assertIn(expected, message.description) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("scanpipe.pipes.purldb.is_available") - def test_scanpipe_populate_purldb_pipeline_integration( - self, mock_is_available, mock_request_post - ): - pipeline_name1 = "load_inventory" - pipeline_name2 = "populate_purldb" - project1 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name1) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - def mock_request_post_return(url, data, headers, timeout): - payload = json.loads(data) - return { - "queued_packages_count": len(payload["packages"]), - "queued_packages": payload["packages"], - "unqueued_packages_count": 1, - "unqueued_packages": [], - "unsupported_packages_count": 1, - "unsupported_packages": [], - } - - mock_request_post.side_effect = mock_request_post_return - mock_is_available.return_value = True - - run = project1.add_pipeline(pipeline_name2) - 
pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertIn("Populating PurlDB with 2 PURLs from DiscoveredPackage", run.log) - self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log) - self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) - self.assertIn("Couldn't index 1 unsupported PURLs", run.log) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("scanpipe.pipes.purldb.is_available") - def test_scanpipe_populate_purldb_pipeline_integration_without_assembly( - self, mock_is_available, mock_request_post - ): - pipeline_name = "populate_purldb" - project1 = make_project() - - def mock_request_post_return(url, data, headers, timeout): - payload = json.loads(data) - return { - "queued_packages_count": len(payload["packages"]), - "queued_packages": payload["packages"], - "unqueued_packages_count": 1, - "unqueued_packages": [], - "unsupported_packages_count": 1, - "unsupported_packages": [], - } - - mock_request_post.side_effect = mock_request_post_return - mock_is_available.return_value = True - - package_json_location = self.data / "manifests" / "package.json" - copy_input(package_json_location, project1.codebase_path) - pipes.collect_and_create_codebase_resources(project1) - - scancode.scan_for_application_packages(project1, assemble=False) - scancode.process_package_data(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertIn("Populating PurlDB with 1 PURLs from DiscoveredPackage", run.log) - self.assertIn( - "Populating PurlDB with 6 unresolved PURLs from DiscoveredDependency", - run.log, - ) - self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) - self.assertIn("Couldn't index 1 unsupported PURLs", run.log) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_ctags_pipeline_integration(self): - pipeline_name = "collect_symbols_ctags" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "d2d-javascript" / "from" / "main.js" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data_symbols = main_file.extra_data.get("source_symbols") - expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] - self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) - - @skipIf(sys.platform != "linux", "Only supported on Linux") - def test_scanpipe_collect_strings_gettext_pipeline_integration(self): - pipeline_name = "collect_strings_gettext" - project1 = make_project() - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "d2d-javascript" / "from" / "main.js" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data_strings = 
main_file.extra_data.get("source_strings") - expected_extra_data_strings = [ - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=", # noqa - "Enter the desired length of your password:", - ] - self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_pygments_pipeline_integration(self): - pipeline_name = "collect_symbols_pygments" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "source-inspector" / "test3.cpp" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data = main_file.extra_data - - expected_extra_data = ( - self.data / "source-inspector" / "test3.cpp-pygments-expected.json" - ) - - with open(expected_extra_data) as f: - expected_extra_data = json.load(f) - - self.assertDictEqual(expected_extra_data, result_extra_data) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_tree_sitter_pipeline_integration(self): - pipeline_name = "collect_symbols_tree_sitter" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "source-inspector" / "test3.cpp" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data = main_file.extra_data - - expected_extra_data = ( - self.data / "source-inspector" / "test3.cpp-tree-sitter-expected.json" - ) - - with open(expected_extra_data) as f: - expected_extra_data = json.load(f) - - self.assertDictEqual(expected_extra_data, result_extra_data) - - @mock.patch("scanpipe.pipes.purldb.is_available") - @mock.patch("scanpipe.pipes.purldb.is_configured") - @mock.patch("scanpipe.pipes.purldb.collect_data_for_purl") - def test_scanpipe_enrich_with_purldb_pipeline_integration( - self, mock_collect_data, mock_is_configured, mock_is_available - ): - pipeline_name = "enrich_with_purldb" - project1 = make_project() - package1 = make_package(project1, package_url="pkg:npm/csvtojson@2.0.10") - - mock_is_configured.return_value = True - mock_is_available.return_value = True - - purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json" - purldb_entry = json.loads(purldb_entry_file.read_text()) - mock_collect_data.return_value = [purldb_entry] - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - self.assertTrue(package1.extra_data.get("enrich_with_purldb")) - - run.refresh_from_db() - self.assertIn("pkg:npm/csvtojson@2.0.10 ['release_date'", run.log) - self.assertIn("1 discovered package enriched with the PurlDB.", run.log) - - def test_scanpipe_benchmark_purls_pipeline_integration(self): - project1 = make_project(name="Analysis") - - file_location = self.data / "benchmark" / "scancodeio_alpine_3.22.1.cdx.json" - 
project1.copy_input_from(file_location) - file_location = self.data / "benchmark" / "alpine-3.22.1-expected-purls.txt" - project1.copy_input_from(file_location) - - run = project1.add_pipeline(pipeline_name="load_sbom") - pipeline = run.make_pipeline_instance() - pipeline.execute() - self.assertEqual(2, project1.codebaseresources.count()) - self.assertEqual(16, project1.discoveredpackages.count()) - - run = project1.add_pipeline(pipeline_name="benchmark_purls") - pipeline = run.make_pipeline_instance() - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - result_file = project1.get_latest_output( - filename="benchmark_purls", extension="txt" - ) - expected_file = self.data / "benchmark" / "alpine-3.22.1-expected-benchmark.txt" - self.assertEqual(expected_file.read_text(), result_file.read_text()) From 195c3b794953f0ce48e81c54bcfcf652d77a56aa Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 08:56:35 +0530 Subject: [PATCH 08/18] Update Dockerfile --- Dockerfile | 96 ------------------------------------------------------ 1 file changed, 96 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5782b8ceaf..621935aa4c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,102 +93,6 @@ RUN pip install --no-cache-dir . # Copy the codebase and set the proper permissions for the APP_USER COPY --chown=$APP_USER:$APP_USER . $APP_DIR -======= -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
- -FROM python:3.13-slim - -LABEL org.opencontainers.image.source="https://github.com/aboutcode-org/scancode.io" -LABEL org.opencontainers.image.description="ScanCode.io" -LABEL org.opencontainers.image.licenses="Apache-2.0" - -# Set default values for APP_UID and APP_GID at build-time -ARG APP_UID=1000 -ARG APP_GID=1000 - -ENV APP_NAME=scancodeio -ENV APP_USER=app -ENV APP_UID=${APP_UID} -ENV APP_GID=${APP_GID} -ENV APP_DIR=/opt/$APP_NAME -ENV VENV_LOCATION=/opt/$APP_NAME/.venv - -# Force Python unbuffered stdout and stderr (they are flushed to terminal immediately) -ENV PYTHONUNBUFFERED=1 -# Do not write Python .pyc files -ENV PYTHONDONTWRITEBYTECODE=1 -# Add the app dir in the Python path for entry points availability -ENV PYTHONPATH=$PYTHONPATH:$APP_DIR - -# OS requirements as per -# https://scancode-toolkit.readthedocs.io/en/latest/getting-started/install.html -# Also install universal-ctags and xgettext for symbol and string collection. -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - bzip2 \ - xz-utils \ - zlib1g \ - libxml2-dev \ - libxslt1-dev \ - libgomp1 \ - libsqlite3-0 \ - libgcrypt20 \ - libpopt0 \ - libzstd1 \ - libgpgme11 \ - libdevmapper1.02.1 \ - libguestfs-tools \ - linux-image-amd64 \ - git \ - wait-for-it \ - universal-ctags \ - gettext \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Create the APP_USER group, user, and directory with specific UID and GID -RUN groupadd --gid $APP_GID --system $APP_USER \ - && useradd --uid $APP_UID --gid $APP_GID --home-dir $APP_DIR --system --create-home $APP_USER \ - && chown $APP_USER:$APP_USER $APP_DIR \ - && mkdir -p /var/$APP_NAME \ - && chown $APP_USER:$APP_USER /var/$APP_NAME - -# Setup the work directory and the user as APP_USER for the remaining stages -WORKDIR $APP_DIR -USER $APP_USER -# Create static/ and workspace/ directories -RUN mkdir -p /var/$APP_NAME/static/ /var/$APP_NAME/workspace/ -# Create the virtualenv -RUN python -m venv $VENV_LOCATION -# Enable the virtualenv, similar effect as "source activate" -ENV PATH=$VENV_LOCATION/bin:$PATH - -# Install the dependencies before the codebase COPY for proper Docker layer caching -COPY --chown=$APP_USER:$APP_USER pyproject.toml $APP_DIR/ -RUN pip install --no-cache-dir . - -# Copy the codebase and set the proper permissions for the APP_USER -COPY --chown=$APP_USER:$APP_USER . $APP_DIR From 48c8b1ca2f188a6d756e1e87d5dc364f913d9edd Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 07:51:43 +0530 Subject: [PATCH 09/18] Revert "add tests for storing packages" This reverts commit 87c81bd08c57ac5ac6d1dee1cc21121cb3363687. --- Dockerfile | 3 - scancodeio/settings.py | 979 +++---- scanpipe/archiving.py | 375 +-- scanpipe/pipelines/__init__.py | 699 +++-- scanpipe/pipes/input.py | 692 ++--- scanpipe/tests/test_archiving.py | 172 +- scanpipe/tests/test_input.py | 255 +- scanpipe/tests/test_pipelines.py | 4114 +++++++++++++++--------------- 8 files changed, 3660 insertions(+), 3629 deletions(-) diff --git a/Dockerfile b/Dockerfile index 621935aa4c..42761550d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,6 +93,3 @@ RUN pip install --no-cache-dir . # Copy the codebase and set the proper permissions for the APP_USER COPY --chown=$APP_USER:$APP_USER . 
$APP_DIR - - - diff --git a/scancodeio/settings.py b/scancodeio/settings.py index 15e52a4440..2d7686900c 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -1,488 +1,491 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import logging -import sys -import tempfile -from pathlib import Path - -import environ - -from scanpipe.archiving import LocalFilesystemProvider - -PROJECT_DIR = environ.Path(__file__) - 1 -ROOT_DIR = PROJECT_DIR - 1 - -# True if running tests through `./manage test` -IS_TESTS = "test" in sys.argv - -# Environment - -ENV_FILE = "/etc/scancodeio/.env" -if not Path(ENV_FILE).exists(): - ENV_FILE = ROOT_DIR(".env") - -# Do not use local .env environment when running the tests. -if IS_TESTS: - ENV_FILE = None - -env = environ.Env() -environ.Env.read_env(ENV_FILE) - -# Security - -SECRET_KEY = env.str("SECRET_KEY", default="") - -ALLOWED_HOSTS = env.list( - "ALLOWED_HOSTS", - default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], -) - -CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) - -# SECURITY WARNING: don't run with debug turned on in production -DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) - -SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( - "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False -) - -SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) - -SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) - -X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") - -SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) - -CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) - -# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT -# are handled by the web server. -SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] - -# ScanCode.io - -SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") - -SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") - -SCANCODEIO_CONFIG_FILE = env.str( - "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" -) - -SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") - -# Set the number of parallel processes to use for ScanCode related scan execution. 
-# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs -# available on the machine. -SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) - -SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") - -# This setting defines the additional locations ScanCode.io will search for pipelines. -# This should be set to a list of strings that contain full paths to your additional -# pipelines directories. -SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) - -# Maximum time allowed for a pipeline to complete. -SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") - -# Default to 2 minutes. -SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) - -# Default to None which scans all files -SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) - -# List views pagination, controls the number of items displayed per page. -# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 -SCANCODEIO_PAGINATE_BY = env.dict( - "SCANCODEIO_PAGINATE_BY", - default={ - "project": 20, - "error": 50, - "resource": 100, - "package": 100, - "dependency": 100, - "license": 100, - "relation": 100, - }, -) - -# Default limit for "most common" entries in QuerySets. -SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) - -# The base URL (e.g., https://hostname/) of this application instance. -# Required for generating URLs to reference objects within the app, -# such as in webhook notifications. -SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") - -# Fetch authentication credentials - -# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" -SCANCODEIO_FETCH_BASIC_AUTH = env.dict( - "SCANCODEIO_FETCH_BASIC_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" -SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( - "SCANCODEIO_FETCH_DIGEST_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" -SCANCODEIO_FETCH_HEADERS = {} -FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") -for entry in FETCH_HEADERS_STR.split(";"): - if entry.strip(): - host, headers = entry.split("=", 1) - SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) - -# SCANCODEIO_NETRC_LOCATION="~/.netrc" -SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") -if SCANCODEIO_NETRC_LOCATION: - # Propagate the location to the environ for `requests.utils.get_netrc_auth` - env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION - -# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" -SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) - -# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" -SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( - "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" -) - -# This webhook will be added as WebhookSubscription for each new project. 
-# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False -SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) - -# Application definition - -INSTALLED_APPS = [ - # Local apps - # Must come before Third-party apps for proper templates override - "scanpipe", - # Django built-in - "django.contrib.auth", - "django.contrib.contenttypes", - "django.contrib.sessions", - "django.contrib.messages", - "django.contrib.staticfiles", - "django.contrib.admin", - "django.contrib.humanize", - # Third-party apps - "crispy_forms", - "crispy_bootstrap3", # required for the djangorestframework browsable API - "django_filters", - "rest_framework", - "rest_framework.authtoken", - "django_rq", - "django_probes", - "taggit", -] - -MIDDLEWARE = [ - "django.middleware.security.SecurityMiddleware", - "django.contrib.sessions.middleware.SessionMiddleware", - "django.middleware.common.CommonMiddleware", - "django.middleware.csrf.CsrfViewMiddleware", - "django.contrib.auth.middleware.AuthenticationMiddleware", - "django.contrib.messages.middleware.MessageMiddleware", - "django.middleware.clickjacking.XFrameOptionsMiddleware", - "scancodeio.middleware.TimezoneMiddleware", -] - -ROOT_URLCONF = "scancodeio.urls" - -WSGI_APPLICATION = "scancodeio.wsgi.application" - -SECURE_PROXY_SSL_HEADER = env.tuple( - "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") -) - -# Database - -DATABASES = { - "default": { - "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), - "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), - "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), - "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), - "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), - "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), - "ATOMIC_REQUESTS": True, - } -} - -DEFAULT_AUTO_FIELD = "django.db.models.AutoField" - -# Forms and filters - -FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") - -# Templates - -TEMPLATES = [ - { - "BACKEND": "django.template.backends.django.DjangoTemplates", - "APP_DIRS": True, - "OPTIONS": { - "debug": DEBUG, - "context_processors": [ - "django.contrib.auth.context_processors.auth", - "django.contrib.messages.context_processors.messages", - "django.template.context_processors.request", - "scancodeio.context_processors.versions", - ], - }, - }, -] - -# Login - -LOGIN_REDIRECT_URL = "project_list" - -# Passwords - -AUTH_PASSWORD_VALIDATORS = [ - { - "NAME": ( - "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" - ), - }, - { - "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", - "OPTIONS": { - "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), - }, - }, - { - "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", - }, -] - -# Testing - -if IS_TESTS: - from django.core.management.utils import get_random_secret_key - - SECRET_KEY = get_random_secret_key() - # Do not pollute the workspace while running the tests. - SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() - SCANCODEIO_REQUIRE_AUTHENTICATION = True - SCANCODEIO_SCAN_FILE_TIMEOUT = 120 - SCANCODEIO_POLICIES_FILE = None - # The default password hasher is rather slow by design. - # Using a faster hashing algorithm in the testing context to speed up the run. 
- PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] - -# Debug toolbar - -DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) -if DEBUG and DEBUG_TOOLBAR: - INSTALLED_APPS.append("debug_toolbar") - MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") - INTERNAL_IPS = ["127.0.0.1"] - -# Logging - -LOGGING = { - "version": 1, - "disable_existing_loggers": False, - "formatters": { - "simple": { - "format": "{levelname} {message}", - "style": "{", - }, - }, - "handlers": { - "null": { - "class": "logging.NullHandler", - }, - "console": { - "class": "logging.StreamHandler", - "formatter": "simple", - }, - }, - "loggers": { - "scanpipe": { - "handlers": ["null"] if IS_TESTS else ["console"], - "level": SCANCODEIO_LOG_LEVEL, - "propagate": False, - }, - "django": { - "handlers": ["null"] if IS_TESTS else ["console"], - "propagate": False, - }, - # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. - "django.db.backends": { - "level": SCANCODEIO_LOG_LEVEL, - }, - }, -} - -# Instead of sending out real emails the console backend just writes the emails -# that would be sent to the standard output. -EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" - -# Internationalization - -LANGUAGE_CODE = "en-us" - -FORMAT_MODULE_PATH = ["scancodeio.formats"] - -TIME_ZONE = env.str("TIME_ZONE", default="UTC") - -USE_I18N = True - -USE_TZ = True - -# Static files (CSS, JavaScript, Images) - -STATIC_URL = "/static/" - -STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") - -STATICFILES_DIRS = [ - PROJECT_DIR("static"), -] - -# Third-party apps - -CRISPY_TEMPLATE_PACK = "bootstrap3" - -# Centralized archive directory for all projects -CENTRAL_ARCHIVE_PATH = env.str( - "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" -) - -# localstorage configuration -DOWNLOAD_ARCHIVING_PROVIDER = env.str( - "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" -) - -# For local storage, we would store the root path in that setting -DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( - "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None -) - -# Initialize the DownloadStore for local storage - -download_store = None -logger = logging.getLogger(__name__) -if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": - config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} - root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) - try: - download_store = LocalFilesystemProvider(root_path=root_path) - except Exception as e: - logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") -else: - logger.error(f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}") - -# Job Queue - -RQ_QUEUES = { - "default": { - "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), - "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), - "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), - "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), - "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), - "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), - # Enable SSL for Redis connections when deploying ScanCode.io in environments - # where Redis is hosted on a separate system (e.g., cloud deployment or remote - # Redis server) to secure data in transit. 
- "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), - }, -} - -SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) -if not SCANCODEIO_ASYNC: - for queue_config in RQ_QUEUES.values(): - queue_config["ASYNC"] = False - -# ClamAV virus scan -CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) -CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") - -# Django restframework - -REST_FRAMEWORK = { - "DEFAULT_AUTHENTICATION_CLASSES": ( - "rest_framework.authentication.TokenAuthentication", - ), - "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), - "DEFAULT_RENDERER_CLASSES": ( - "rest_framework.renderers.JSONRenderer", - "rest_framework.renderers.BrowsableAPIRenderer", - "rest_framework.renderers.AdminRenderer", - ), - "DEFAULT_FILTER_BACKENDS": ( - "django_filters.rest_framework.DjangoFilterBackend", - "rest_framework.filters.SearchFilter", - ), - "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", - "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), - "UPLOADED_FILES_USE_URL": False, -} - -if not SCANCODEIO_REQUIRE_AUTHENTICATION: - REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( - "rest_framework.permissions.AllowAny", - ) - -# VulnerableCode integration - -VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") -VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") -VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") -VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") - -# PurlDB integration - -PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") -PURLDB_USER = env.str("PURLDB_USER", default="") -PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") -PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") - -# MatchCode.io integration - -MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") -MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") -MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") -MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") - -# FederatedCode integration - -FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( - "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" -).rstrip("/") -FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") -FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") -FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. 
Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import sys +import tempfile +from pathlib import Path +import logging + +import environ + +from scanpipe.archiving import LocalFilesystemProvider + + +PROJECT_DIR = environ.Path(__file__) - 1 +ROOT_DIR = PROJECT_DIR - 1 + +# True if running tests through `./manage test` +IS_TESTS = "test" in sys.argv + +# Environment + +ENV_FILE = "/etc/scancodeio/.env" +if not Path(ENV_FILE).exists(): + ENV_FILE = ROOT_DIR(".env") + +# Do not use local .env environment when running the tests. +if IS_TESTS: + ENV_FILE = None + +env = environ.Env() +environ.Env.read_env(ENV_FILE) + +# Security + +SECRET_KEY = env.str("SECRET_KEY", default="") + +ALLOWED_HOSTS = env.list( + "ALLOWED_HOSTS", + default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], +) + +CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) + +# SECURITY WARNING: don't run with debug turned on in production +DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) + +SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( + "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False +) + +SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) + +SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) + +X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") + +SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) + +CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) + +# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT +# are handled by the web server. +SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] + +# ScanCode.io + +SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") + +SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") + +SCANCODEIO_CONFIG_FILE = env.str( + "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" +) + +SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") + +# Set the number of parallel processes to use for ScanCode related scan execution. +# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs +# available on the machine. +SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) + +SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") + +# This setting defines the additional locations ScanCode.io will search for pipelines. +# This should be set to a list of strings that contain full paths to your additional +# pipelines directories. +SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) + +# Maximum time allowed for a pipeline to complete. +SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") + +# Default to 2 minutes. +SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) + +# Default to None which scans all files +SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) + +# List views pagination, controls the number of items displayed per page. 
+# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 +SCANCODEIO_PAGINATE_BY = env.dict( + "SCANCODEIO_PAGINATE_BY", + default={ + "project": 20, + "error": 50, + "resource": 100, + "package": 100, + "dependency": 100, + "license": 100, + "relation": 100, + }, +) + +# Default limit for "most common" entries in QuerySets. +SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) + +# The base URL (e.g., https://hostname/) of this application instance. +# Required for generating URLs to reference objects within the app, +# such as in webhook notifications. +SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") + +# Fetch authentication credentials + +# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" +SCANCODEIO_FETCH_BASIC_AUTH = env.dict( + "SCANCODEIO_FETCH_BASIC_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" +SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( + "SCANCODEIO_FETCH_DIGEST_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" +SCANCODEIO_FETCH_HEADERS = {} +FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") +for entry in FETCH_HEADERS_STR.split(";"): + if entry.strip(): + host, headers = entry.split("=", 1) + SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) + +# SCANCODEIO_NETRC_LOCATION="~/.netrc" +SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") +if SCANCODEIO_NETRC_LOCATION: + # Propagate the location to the environ for `requests.utils.get_netrc_auth` + env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION + +# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" +SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) + +# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" +SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( + "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" +) + +# This webhook will be added as WebhookSubscription for each new project. 
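The fetch credential settings above share the dict-like .env syntax documented in their comments. As a concrete sketch, with a hypothetical host and credentials:

    SCANCODEIO_FETCH_BASIC_AUTH="files.example.com=alice,s3cret;"
    SCANCODEIO_FETCH_HEADERS="files.example.com=Authorization=Token abc123;"

With these values, SCANCODEIO_FETCH_BASIC_AUTH should parse to {"files.example.com": ("alice", "s3cret")} via the tuple cast, and the parsing loop above turns SCANCODEIO_FETCH_HEADERS into {"files.example.com": {"Authorization": "Token abc123"}}.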
+# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False +SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) + +# Application definition + +INSTALLED_APPS = [ + # Local apps + # Must come before Third-party apps for proper templates override + "scanpipe", + # Django built-in + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", + "django.contrib.humanize", + # Third-party apps + "crispy_forms", + "crispy_bootstrap3", # required for the djangorestframework browsable API + "django_filters", + "rest_framework", + "rest_framework.authtoken", + "django_rq", + "django_probes", + "taggit", +] + +MIDDLEWARE = [ + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", + "scancodeio.middleware.TimezoneMiddleware", +] + +ROOT_URLCONF = "scancodeio.urls" + +WSGI_APPLICATION = "scancodeio.wsgi.application" + +SECURE_PROXY_SSL_HEADER = env.tuple( + "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") +) + +# Database + +DATABASES = { + "default": { + "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), + "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), + "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), + "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), + "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, + } +} + +DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + +# Forms and filters + +FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") + +# Templates + +TEMPLATES = [ + { + "BACKEND": "django.template.backends.django.DjangoTemplates", + "APP_DIRS": True, + "OPTIONS": { + "debug": DEBUG, + "context_processors": [ + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "django.template.context_processors.request", + "scancodeio.context_processors.versions", + ], + }, + }, +] + +# Login + +LOGIN_REDIRECT_URL = "project_list" + +# Passwords + +AUTH_PASSWORD_VALIDATORS = [ + { + "NAME": ( + "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" + ), + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + "OPTIONS": { + "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), + }, + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, +] + +# Testing + +if IS_TESTS: + from django.core.management.utils import get_random_secret_key + + SECRET_KEY = get_random_secret_key() + # Do not pollute the workspace while running the tests. + SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() + SCANCODEIO_REQUIRE_AUTHENTICATION = True + SCANCODEIO_SCAN_FILE_TIMEOUT = 120 + SCANCODEIO_POLICIES_FILE = None + # The default password hasher is rather slow by design. + # Using a faster hashing algorithm in the testing context to speed up the run. 
+ PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] + +# Debug toolbar + +DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) +if DEBUG and DEBUG_TOOLBAR: + INSTALLED_APPS.append("debug_toolbar") + MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") + INTERNAL_IPS = ["127.0.0.1"] + +# Logging + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "simple": { + "format": "{levelname} {message}", + "style": "{", + }, + }, + "handlers": { + "null": { + "class": "logging.NullHandler", + }, + "console": { + "class": "logging.StreamHandler", + "formatter": "simple", + }, + }, + "loggers": { + "scanpipe": { + "handlers": ["null"] if IS_TESTS else ["console"], + "level": SCANCODEIO_LOG_LEVEL, + "propagate": False, + }, + "django": { + "handlers": ["null"] if IS_TESTS else ["console"], + "propagate": False, + }, + # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. + "django.db.backends": { + "level": SCANCODEIO_LOG_LEVEL, + }, + }, +} + +# Instead of sending out real emails the console backend just writes the emails +# that would be sent to the standard output. +EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" + +# Internationalization + +LANGUAGE_CODE = "en-us" + +FORMAT_MODULE_PATH = ["scancodeio.formats"] + +TIME_ZONE = env.str("TIME_ZONE", default="UTC") + +USE_I18N = True + +USE_TZ = True + +# Static files (CSS, JavaScript, Images) + +STATIC_URL = "/static/" + +STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") + +STATICFILES_DIRS = [ + PROJECT_DIR("static"), +] + +# Third-party apps + +CRISPY_TEMPLATE_PACK = "bootstrap3" + +# Centralized archive directory for all projects +CENTRAL_ARCHIVE_PATH = env.str( + "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" +) + +# localstorage configuration +DOWNLOAD_ARCHIVING_PROVIDER = env.str( + "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" +) + +# For local storage, we would store the root path in that setting +DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( + "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None +) + +# Initialize the DownloadStore for local storage + +download_store = None +logger = logging.getLogger(__name__) +if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": + config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} + root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) + try: + download_store = LocalFilesystemProvider(root_path=root_path) + except Exception as e: + logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") +else: + logger.error( + f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}" + ) + +# Job Queue + +RQ_QUEUES = { + "default": { + "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), + "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), + "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), + "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), + "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), + "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), + # Enable SSL for Redis connections when deploying ScanCode.io in environments + # where Redis is hosted on a separate system (e.g., cloud deployment or remote + # Redis server) to secure data in transit. 
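For the local archiving provider initialized above, a deployment can relocate the store through the environment. An illustrative sketch (the root_path key is the one read by the initialization code, with CENTRAL_ARCHIVE_PATH as its fallback):

    CENTRAL_ARCHIVE_PATH=/var/scancodeio/archives
    DOWNLOAD_ARCHIVING_PROVIDER=localstorage
    DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION="root_path=/mnt/archive-volume"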
+ "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), + }, +} + +SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) +if not SCANCODEIO_ASYNC: + for queue_config in RQ_QUEUES.values(): + queue_config["ASYNC"] = False + +# ClamAV virus scan +CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) +CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") + +# Django restframework + +REST_FRAMEWORK = { + "DEFAULT_AUTHENTICATION_CLASSES": ( + "rest_framework.authentication.TokenAuthentication", + ), + "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), + "DEFAULT_RENDERER_CLASSES": ( + "rest_framework.renderers.JSONRenderer", + "rest_framework.renderers.BrowsableAPIRenderer", + "rest_framework.renderers.AdminRenderer", + ), + "DEFAULT_FILTER_BACKENDS": ( + "django_filters.rest_framework.DjangoFilterBackend", + "rest_framework.filters.SearchFilter", + ), + "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", + "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), + "UPLOADED_FILES_USE_URL": False, +} + +if not SCANCODEIO_REQUIRE_AUTHENTICATION: + REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( + "rest_framework.permissions.AllowAny", + ) + +# VulnerableCode integration + +VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") +VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") +VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") +VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") + +# PurlDB integration + +PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") +PURLDB_USER = env.str("PURLDB_USER", default="") +PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") +PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") + +# MatchCode.io integration + +MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") +MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") +MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") +MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") + +# FederatedCode integration + +FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( + "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" +).rstrip("/") +FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") +FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") +FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py index 3f3d66e2e8..482f448de5 100644 --- a/scanpipe/archiving.py +++ b/scanpipe/archiving.py @@ -1,185 +1,190 @@ -# scanpipe/archiving.py -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
-# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import json -import logging -from abc import ABC -from abc import abstractmethod -from dataclasses import dataclass -from pathlib import Path - -logger = logging.getLogger(__name__) - - -@dataclass -class Download: - sha256: str - download_date: str - download_url: str - filename: str - - -class DownloadStore(ABC): - def _compute_sha256(self, content: bytes) -> str: - """Compute SHA256 hash for content.""" - return hashlib.sha256(content).hexdigest() - - def _compute_origin_hash( - self, filename: str, download_date: str, download_url: str - ) -> str: - """Compute a hash for the metadata to name the origin JSON file.""" - to_hash = f"{filename}{download_date}{download_url}".encode() - return hashlib.sha256(to_hash).hexdigest() - - def _build_metadata( - self, sha256: str, filename: str, download_date: str, download_url: str - ) -> dict: - """Build metadata dictionary for JSON storage.""" - return { - "sha256": sha256, - "filename": filename, - "download_date": download_date, - "download_url": download_url, - } - - @abstractmethod - def _get_content_path(self, sha256: str) -> str: - """Get the storage path/key for the content based on SHA256.""" - pass - - @abstractmethod - def list(self): - """Return an iterable of all stored downloads.""" - pass - - @abstractmethod - def get(self, sha256_checksum: str): - """Return a Download object for this checksum or None.""" - pass - - @abstractmethod - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """ - Store content with its metadata. Return a Download object on success. - Raise an exception on error. - """ - pass - - @abstractmethod - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Return a Download object matching the metadata or None.""" - pass - - -class LocalFilesystemProvider(DownloadStore): - def __init__(self, root_path: Path): - self.root_path = root_path - - def _get_content_path(self, sha256: str) -> Path: - """Create a nested path like 59/4c/67/... 
based on the SHA256 hash.""" - return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] - - def list(self): - """Return an iterable of all stored downloads.""" - downloads = [] - for content_path in self.root_path.rglob("content"): - origin_files = list(content_path.parent.glob("origin-*.json")) - for origin_file in origin_files: - try: - with open(origin_file) as f: - data = json.load(f) - downloads.append(Download(**data)) - except Exception as e: - logger.error(f"Error reading {origin_file}: {e}") - return downloads - - def get(self, sha256_checksum: str): - """Retrieve a Download object for the given SHA256 hash.""" - content_path = self._get_content_path(sha256_checksum) - if content_path.exists(): - origin_files = list(content_path.glob("origin-*.json")) - if origin_files: - try: - with open(origin_files[0]) as f: - data = json.load(f) - return Download(**data) - except Exception as e: - logger.error( - f"Error reading origin file for {sha256_checksum}: {e}" - ) - return None - - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store the content and its metadata.""" - sha256 = self._compute_sha256(content) - content_path = self._get_content_path(sha256) - content_path.mkdir(parents=True, exist_ok=True) - - content_file = content_path / "content" - if not content_file.exists(): - try: - with open(content_file, "wb") as f: - f.write(content) - except Exception as e: - raise Exception(f"Failed to write content to {content_file}: {e}") - - origin_hash = self._compute_origin_hash(filename, download_date, download_url) - origin_filename = f"origin-{origin_hash}.json" - origin_path = content_path / origin_filename - if origin_path.exists(): - raise Exception(f"Origin {origin_filename} already exists") - - metadata = self._build_metadata(sha256, filename, download_date, download_url) - try: - with open(origin_path, "w") as f: - json.dump(metadata, f, indent=2) - except Exception as e: - raise Exception(f"Failed to write metadata to {origin_path}: {e}") - - return Download(**metadata) - - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Find a download based on metadata.""" - if not (download_url or filename or download_date): - return None - for content_path in self.root_path.rglob("origin-*.json"): - try: - with open(content_path) as f: - data = json.load(f) - if ( - (download_url is None or data.get("url") == download_url) - and (filename is None or data.get("filename") == filename) - and ( - download_date is None - or data.get("download_date") == download_date - ) - ): - return Download(**data) - except Exception as e: - logger.error(f"Error reading {content_path}: {e}") - return None +# scanpipe/archiving.py +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import hashlib +import json +import logging +import os +import stat +from abc import ABC +from abc import abstractmethod +from dataclasses import dataclass +from pathlib import Path + + +logger = logging.getLogger(__name__) + + +@dataclass +class Download: + sha256: str + download_date: str + download_url: str + filename: str + + +class DownloadStore(ABC): + def _compute_sha256(self, content: bytes) -> str: + """Compute SHA256 hash for content.""" + return hashlib.sha256(content).hexdigest() + + def _compute_origin_hash( + self, filename: str, download_date: str, download_url: str + ) -> str: + """Compute a hash for the metadata to name the origin JSON file.""" + to_hash = f"{filename}{download_date}{download_url}".encode() + return hashlib.sha256(to_hash).hexdigest() + + def _build_metadata( + self, sha256: str, filename: str, download_date: str, download_url: str + ) -> dict: + """Build metadata dictionary for JSON storage.""" + return { + "sha256": sha256, + "filename": filename, + "download_date": download_date, + "download_url": download_url, + } + + @abstractmethod + def _get_content_path(self, sha256: str) -> str: + """Get the storage path/key for the content based on SHA256.""" + pass + + @abstractmethod + def list(self): + """Return an iterable of all stored downloads.""" + pass + + @abstractmethod + def get(self, sha256_checksum: str): + """Return a Download object for this checksum or None.""" + pass + + @abstractmethod + def put(self, content: bytes, download_url: str, download_date: str, filename: str): + """ + Store content with its metadata. Return a Download object on success. + Raise an exception on error. + """ + pass + + @abstractmethod + def find( + self, download_url: str = None, filename: str = None, download_date: str = None + ): + """Return a Download object matching the metadata or None.""" + pass + + +class LocalFilesystemProvider(DownloadStore): + def __init__(self, root_path: Path): + self.root_path = root_path + + def _get_content_path(self, sha256: str) -> Path: + """Create a nested path like 59/4c/67/... 
+        return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]
+
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        downloads = []
+        for content_path in self.root_path.rglob("content"):
+            origin_files = list(content_path.parent.glob("origin-*.json"))
+            for origin_file in origin_files:
+                try:
+                    with open(origin_file) as f:
+                        data = json.load(f)
+                    downloads.append(Download(**data))
+                except Exception as e:
+                    logger.error(f"Error reading {origin_file}: {e}")
+        return downloads
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        content_path = self._get_content_path(sha256_checksum)
+        if content_path.exists():
+            origin_files = list(content_path.glob("origin-*.json"))
+            if origin_files:
+                try:
+                    with open(origin_files[0]) as f:
+                        data = json.load(f)
+                    return Download(path=str(content_path / "content"), **data)
+                except Exception as e:
+                    logger.error(
+                        f"Error reading origin file for {sha256_checksum}: {e}"
+                    )
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_path = self._get_content_path(sha256)
+        content_path.mkdir(parents=True, exist_ok=True)
+
+        content_file = content_path / "content"
+        if not content_file.exists():
+            try:
+                with open(content_file, "wb") as f:
+                    f.write(content)
+            except Exception as e:
+                raise Exception(f"Failed to write content to {content_file}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_path = content_path / origin_filename
+        if origin_path.exists():
+            raise Exception(f"Origin {origin_filename} already exists")
+
+        metadata = self._build_metadata(sha256, filename, download_date, download_url)
+        try:
+            with open(origin_path, "w") as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            raise Exception(f"Failed to write metadata to {origin_path}: {e}")
+
+        # Expose the on-disk location so callers can record it as file_path.
+        return Download(path=str(content_file), **metadata)
+
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        for origin_path in self.root_path.rglob("origin-*.json"):
+            try:
+                with open(origin_path) as f:
+                    data = json.load(f)
+                if (
+                    (download_url is None or data.get("download_url") == download_url)
+                    and (filename is None or data.get("filename") == filename)
+                    and (
+                        download_date is None
+                        or data.get("download_date") == download_date
+                    )
+                ):
+                    return Download(**data)
+            except Exception as e:
+                logger.error(f"Error reading {origin_path}: {e}")
+        return None
+
+
diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py
index 5153bf1887..1b6cd4e0a0 100644
--- a/scanpipe/pipelines/__init__.py
+++ b/scanpipe/pipelines/__init__.py
@@ -1,353 +1,346 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import inspect -import logging -import traceback -from contextlib import contextmanager -from datetime import datetime -from functools import wraps -from pathlib import Path - -import bleach -from markdown_it import MarkdownIt -from pyinstrument import Profiler - -from aboutcode.pipeline import BasePipeline -from scancodeio.settings import download_store -from scancodeio.settings import settings - -logger = logging.getLogger(__name__) - - -class InputFilesError(Exception): - """InputFile is missing or cannot be downloaded.""" - - def __init__(self, error_tracebacks): - self.error_tracebacks = error_tracebacks - super().__init__(self._generate_message()) - - def _generate_message(self): - message = "InputFilesError encountered with the following issues:\n" - for index, (error, tb) in enumerate(self.error_tracebacks, start=1): - message += f"\nError {index}: {str(error)}\n\n{tb}" - return message - - -def convert_markdown_to_html(markdown_text): - """Convert Markdown text to sanitized HTML.""" - # Using the "js-default" for safety. - html_content = MarkdownIt("js-default").renderInline(markdown_text) - # Sanitize HTML using bleach. - sanitized_html = bleach.clean(html_content) - return sanitized_html - - -class CommonStepsMixin: - """Common steps available on all project pipelines.""" - - def flag_empty_files(self): - """Flag empty files.""" - from scanpipe.pipes import flag - - flag.flag_empty_files(self.project) - - def flag_ignored_resources(self): - """Flag ignored resources based on Project ``ignored_patterns`` setting.""" - from scanpipe.pipes import flag - - ignored_patterns = self.env.get("ignored_patterns", []) - - if isinstance(ignored_patterns, str): - ignored_patterns = ignored_patterns.splitlines() - ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS) - - flag.flag_ignored_patterns( - codebaseresources=self.project.codebaseresources.no_status(), - patterns=ignored_patterns, - ) - - def extract_archive(self, location, target): - """Extract archive at `location` to `target`. 
Save errors as messages.""" - from scanpipe.pipes import scancode - - extract_errors = scancode.extract_archive(location, target) - - for resource_location, errors in extract_errors.items(): - resource_path = Path(resource_location) - - if resource_path.is_relative_to(self.project.codebase_path): - resource_path = resource_path.relative_to(self.project.codebase_path) - details = {"resource_path": str(resource_path)} - elif resource_path.is_relative_to(self.project.input_path): - resource_path = resource_path.relative_to(self.project.input_path) - details = {"path": f"input/{str(resource_path)}"} - else: - details = {"filename": str(resource_path.name)} - - self.project.add_error( - description="\n".join(errors), - model="extract_archive", - details=details, - ) - - def extract_archives(self, location=None): - """Extract archives located in the codebase/ directory with extractcode.""" - from scanpipe.pipes import scancode - - if not location: - location = self.project.codebase_path - - extract_errors = scancode.extract_archives(location=location, recurse=True) - - for resource_path, errors in extract_errors.items(): - self.project.add_error( - description="\n".join(errors), - model="extract_archives", - details={"resource_path": resource_path}, - ) - - # Reload the project env post-extraction as the scancode-config.yml file - # may be located in one of the extracted archives. - self.env = self.project.get_env() - - def download_missing_inputs(self): - """ - Download any InputSource missing on disk. - Raise an error if any of the uploaded files is not available or not reachable. - """ - error_tracebacks = [] - - for input_source in self.project.inputsources.all(): - if input_source.exists(): - continue - - if input_source.is_uploaded: - msg = f"Uploaded file {input_source} not available." - self.log(msg) - error_tracebacks.append((msg, "No traceback available.")) - continue - - download_url = input_source.download_url - if not download_url: - continue - - url_hash = hashlib.sha256(download_url.encode()).hexdigest() - filename = ( - input_source.filename - or Path(download_url).name - or f"{url_hash}.archive" - ) - archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if archive_path.exists(): - logger.info(f"Reusing existing archive at {archive_path}") - input_source.file_path = str(archive_path) - input_source.save() - continue - - self.log(f"Fetching input from {input_source.download_url}") - try: - input_source.fetch() - - except Exception as error: - traceback_str = traceback.format_exc() - logger.error(traceback_str) - self.log(f"{input_source.download_url} could not be fetched.") - error_tracebacks.append((str(error), traceback_str)) - - if error_tracebacks: - raise InputFilesError(error_tracebacks) - - def archive_downloads(self): - """ - Archive downloaded inputs to the centralized DownloadStore if not already - archived.Updates InputSource with archiving metadata (sha256, download_date). 
- """ - logger.info(f"Archiving downloads for project {self.project.name}") - for input_source in self.project.inputsources.filter( - sha256__isnull=True, is_uploaded=False - ): - if input_source.download_url: - logger.warning( - f"No download URL for input {input_source.filename}, " - "skipping archiving" - ) - continue - - if not input_source.file_path: - logger.warning( - f"No file_path for input {input_source.download_url}, " - "skipping archiving" - ) - continue - try: - with open(input_source.file_path, "rb") as f: - content = f.read() - filename = ( - input_source.filename or input_source.download_url.split("/")[-1] - ) - download = download_store.put( - content=content, - download_url=input_source.download_url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - input_source.sha256 = download.sha256 - input_source.download_date = download.download_date - input_source.file_path = str(download.path) - input_source.save() - except Exception as e: - self.add_error( - exception=e, - message=f"Failed to archive {input_source.download_url}", - ) - - -class ProjectPipeline(CommonStepsMixin, BasePipeline): - """Main class for all project related pipelines including common steps methods.""" - - # Flag specifying whether to download missing inputs as an initial step. - download_inputs = True - - # Optional URL that targets a view of the results relative to this Pipeline. - # This URL may contain dictionary-style string formatting, which will be - # interpolated against the project's field attributes. - # For example, you could use results_url="/project/{slug}/packages/?filter=value" - # to target the Package list view with an active filtering. - results_url = "" - - def __init__(self, run_instance): - """Load the Pipeline execution context from a Run database object.""" - self.run = run_instance - self.project = run_instance.project - self.env = self.project.get_env() - - self.pipeline_class = run_instance.pipeline_class - self.pipeline_name = run_instance.pipeline_name - - self.selected_groups = run_instance.selected_groups or [] - self.selected_steps = run_instance.selected_steps or [] - - self.ecosystem_config = None - - @classmethod - def get_initial_steps(cls): - """Add the ``download_inputs`` step as an initial step if enabled.""" - steps = [] - if cls.download_inputs: - steps.append(cls.download_missing_inputs) - steps.append(cls.archive_downloads) - return tuple(steps) - - @classmethod - def get_info(cls, as_html=False): - """Add the option to render the values as HTML.""" - info = super().get_info() - - if as_html: - info["summary"] = convert_markdown_to_html(info["summary"]) - info["description"] = convert_markdown_to_html(info["description"]) - for step in info["steps"]: - step["doc"] = convert_markdown_to_html(step["doc"]) - - return info - - def append_to_log(self, message): - self.run.append_to_log(message) - - def set_current_step(self, message): - self.run.set_current_step(message) - - def add_error(self, exception, resource=None): - """Create a ``ProjectMessage`` ERROR record on the current `project`.""" - self.project.add_error( - model=self.pipeline_name, - exception=exception, - object_instance=resource, - ) - - @contextmanager - def save_errors(self, *exceptions, **kwargs): - """ - Context manager to save specified exceptions as ``ProjectMessage`` in the - database. 
- - - Example in a Pipeline step:: - - with self.save_errors(rootfs.DistroNotFound): - rootfs.scan_rootfs_for_system_packages(self.project, rfs) - - - Example when iterating over resources:: - - for resource in self.project.codebaseresources.all(): - with self.save_errors(Exception, resource=resource): - analyse(resource) - """ - try: - yield - except exceptions as error: - self.add_error(exception=error, **kwargs) - - -class Pipeline(ProjectPipeline): - """Alias for the ProjectPipeline class.""" - - pass - - -def is_pipeline(obj): - """ - Return True if the `obj` is a subclass of `Pipeline` except for the - `Pipeline` class itself. - """ - return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline - - -def profile(step): - """ - Profile a Pipeline step and save the results as HTML file in the project output - directory. - - Usage: - @profile - def step(self): - pass - """ - - @wraps(step) - def wrapper(*arg, **kwargs): - pipeline_instance = arg[0] - project = pipeline_instance.project - - with Profiler() as profiler: - result = step(*arg, **kwargs) - - output_file = project.get_output_file_path("profile", "html") - output_file.write_text(profiler.output_html()) - - pipeline_instance.log(f"Profiling results at {output_file.resolve()}") - - return result - - return wrapper +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
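+#
+# Note on the download archiving flow added in this patch (an illustrative
+# sketch, not additional API): when ENABLE_DOWNLOAD_ARCHIVING is set, the
+# initial pipeline steps run download_missing_inputs and then
+# archive_downloads, which stores each fetched input once per SHA256 in the
+# configured DownloadStore and records the result on the InputSource, roughly:
+#
+#   download = download_store.put(
+#       content=content,
+#       download_url=input_source.download_url,
+#       download_date=datetime.now().isoformat(),
+#       filename=filename,
+#   )
+#   input_source.sha256 = download.sha256
+#
+# See archive_downloads() below for the actual implementation.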
+
+import hashlib
+import inspect
+import logging
+import traceback
+from contextlib import contextmanager
+from datetime import datetime
+from functools import wraps
+from pathlib import Path
+
+import bleach
+import requests
+from markdown_it import MarkdownIt
+from pyinstrument import Profiler
+
+from aboutcode.pipeline import BasePipeline
+from scancodeio.settings import ENABLE_DOWNLOAD_ARCHIVING
+from scancodeio.settings import download_store
+from scancodeio.settings import settings
+
+logger = logging.getLogger(__name__)
+
+
+class InputFilesError(Exception):
+    """InputFile is missing or cannot be downloaded."""
+
+    def __init__(self, error_tracebacks):
+        self.error_tracebacks = error_tracebacks
+        super().__init__(self._generate_message())
+
+    def _generate_message(self):
+        message = "InputFilesError encountered with the following issues:\n"
+        for index, (error, tb) in enumerate(self.error_tracebacks, start=1):
+            message += f"\nError {index}: {str(error)}\n\n{tb}"
+        return message
+
+
+def convert_markdown_to_html(markdown_text):
+    """Convert Markdown text to sanitized HTML."""
+    # Using the "js-default" for safety.
+    html_content = MarkdownIt("js-default").renderInline(markdown_text)
+    # Sanitize HTML using bleach.
+    sanitized_html = bleach.clean(html_content)
+    return sanitized_html
+
+
+class CommonStepsMixin:
+    """Common steps available on all project pipelines."""
+
+    def flag_empty_files(self):
+        """Flag empty files."""
+        from scanpipe.pipes import flag
+
+        flag.flag_empty_files(self.project)
+
+    def flag_ignored_resources(self):
+        """Flag ignored resources based on Project ``ignored_patterns`` setting."""
+        from scanpipe.pipes import flag
+
+        ignored_patterns = self.env.get("ignored_patterns", [])
+
+        if isinstance(ignored_patterns, str):
+            ignored_patterns = ignored_patterns.splitlines()
+        ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)
+
+        flag.flag_ignored_patterns(
+            codebaseresources=self.project.codebaseresources.no_status(),
+            patterns=ignored_patterns,
+        )
+
+    def extract_archive(self, location, target):
+        """Extract archive at `location` to `target`. Save errors as messages."""
+        from scanpipe.pipes import scancode
+
+        extract_errors = scancode.extract_archive(location, target)
+
+        for resource_location, errors in extract_errors.items():
+            resource_path = Path(resource_location)
+
+            if resource_path.is_relative_to(self.project.codebase_path):
+                resource_path = resource_path.relative_to(self.project.codebase_path)
+                details = {"resource_path": str(resource_path)}
+            elif resource_path.is_relative_to(self.project.input_path):
+                resource_path = resource_path.relative_to(self.project.input_path)
+                details = {"path": f"input/{str(resource_path)}"}
+            else:
+                details = {"filename": str(resource_path.name)}
+
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archive",
+                details=details,
+            )
+
+    def extract_archives(self, location=None):
+        """Extract archives located in the codebase/ directory with extractcode."""
+        from scanpipe.pipes import scancode
+
+        if not location:
+            location = self.project.codebase_path
+
+        extract_errors = scancode.extract_archives(location=location, recurse=True)
+
+        for resource_path, errors in extract_errors.items():
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archives",
+                details={"resource_path": resource_path},
+            )
+
+        # Reload the project env post-extraction as the scancode-config.yml file
+        # may be located in one of the extracted archives.
+        self.env = self.project.get_env()
+
+    def download_missing_inputs(self):
+        """
+        Download any InputSource missing on disk.
+        Raise an error if any of the uploaded files is not available or not reachable.
+        """
+        error_tracebacks = []
+
+        for input_source in self.project.inputsources.all():
+            if input_source.exists():
+                continue
+
+            if input_source.is_uploaded:
+                msg = f"Uploaded file {input_source} not available."
+                self.log(msg)
+                error_tracebacks.append((msg, "No traceback available."))
+                continue
+
+            download_url = input_source.download_url
+            if not download_url:
+                continue
+
+            url_hash = hashlib.sha256(download_url.encode()).hexdigest()
+            filename = (
+                input_source.filename
+                or Path(download_url).name
+                or f"{url_hash}.archive"
+            )
+            archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
+
+            if archive_path.exists():
+                logger.info(f"Reusing existing archive at {archive_path}")
+                input_source.file_path = str(archive_path)
+                input_source.save()
+                continue
+
+            self.log(f"Fetching input from {input_source.download_url}")
+            try:
+                input_source.fetch()
+
+            except Exception as error:
+                traceback_str = traceback.format_exc()
+                logger.error(traceback_str)
+                self.log(f"{input_source.download_url} could not be fetched.")
+                error_tracebacks.append((str(error), traceback_str))
+
+        if error_tracebacks:
+            raise InputFilesError(error_tracebacks)
+
+    def archive_downloads(self):
+        """
+        Archive downloaded inputs to the centralized DownloadStore if not already
+        archived. Update the InputSource with archiving metadata (sha256 and
+        download_date).
+        """
+        logger.info(f"Archiving downloads for project {self.project.name}")
+        for input_source in self.project.inputsources.filter(
+            sha256__isnull=True, is_uploaded=False
+        ):
+            if input_source.download_url:
+                try:
+                    # Fetch the content again so the pristine download is the
+                    # artifact that gets archived.
+                    response = requests.get(
+                        input_source.download_url, stream=True, timeout=30
+                    )
+                    response.raise_for_status()
+                    content = response.content
+                    filename = (
+                        input_source.filename
+                        or input_source.download_url.split("/")[-1]
+                    )
+                    download = download_store.put(
+                        content=content,
+                        download_url=input_source.download_url,
+                        download_date=datetime.now().isoformat(),
+                        filename=filename,
+                    )
+                    input_source.sha256 = download.sha256
+                    input_source.download_date = download.download_date
+                    input_source.save()
+                except Exception as e:
+                    logger.error(
+                        f"Failed to archive {input_source.download_url}: {e}"
+                    )
+                    self.add_error(exception=e)
+            else:
+                logger.warning(
+                    f"No download URL for input {input_source.filename}, "
+                    "skipping archiving"
+                )
+
+
+class ProjectPipeline(CommonStepsMixin, BasePipeline):
+    """Main class for all project related pipelines including common steps methods."""
+
+    # Flag specifying whether to download missing inputs as an initial step.
+    download_inputs = True
+
+    # Optional URL that targets a view of the results relative to this Pipeline.
+    # This URL may contain dictionary-style string formatting, which will be
+    # interpolated against the project's field attributes.
+    # For example, you could use results_url="/project/{slug}/packages/?filter=value"
+    # to target the Package list view with an active filtering.
+ results_url = "" + + def __init__(self, run_instance): + """Load the Pipeline execution context from a Run database object.""" + self.run = run_instance + self.project = run_instance.project + self.env = self.project.get_env() + + self.pipeline_class = run_instance.pipeline_class + self.pipeline_name = run_instance.pipeline_name + + self.selected_groups = run_instance.selected_groups or [] + self.selected_steps = run_instance.selected_steps or [] + + self.ecosystem_config = None + + @classmethod + def get_initial_steps(cls): + """Add the ``download_inputs`` step as an initial step if enabled.""" + steps = [] + if cls.download_inputs: + steps.append(cls.download_missing_inputs) + if ENABLE_DOWNLOAD_ARCHIVING: + steps.append(cls.archive_downloads) + return tuple(steps) + + @classmethod + def get_info(cls, as_html=False): + """Add the option to render the values as HTML.""" + info = super().get_info() + + if as_html: + info["summary"] = convert_markdown_to_html(info["summary"]) + info["description"] = convert_markdown_to_html(info["description"]) + for step in info["steps"]: + step["doc"] = convert_markdown_to_html(step["doc"]) + + return info + + def append_to_log(self, message): + self.run.append_to_log(message) + + def set_current_step(self, message): + self.run.set_current_step(message) + + def add_error(self, exception, resource=None): + """Create a ``ProjectMessage`` ERROR record on the current `project`.""" + self.project.add_error( + model=self.pipeline_name, + exception=exception, + object_instance=resource, + ) + + @contextmanager + def save_errors(self, *exceptions, **kwargs): + """ + Context manager to save specified exceptions as ``ProjectMessage`` in the + database. + + - Example in a Pipeline step:: + + with self.save_errors(rootfs.DistroNotFound): + rootfs.scan_rootfs_for_system_packages(self.project, rfs) + + - Example when iterating over resources:: + + for resource in self.project.codebaseresources.all(): + with self.save_errors(Exception, resource=resource): + analyse(resource) + """ + try: + yield + except exceptions as error: + self.add_error(exception=error, **kwargs) + + +class Pipeline(ProjectPipeline): + """Alias for the ProjectPipeline class.""" + + pass + + +def is_pipeline(obj): + """ + Return True if the `obj` is a subclass of `Pipeline` except for the + `Pipeline` class itself. + """ + return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline + + +def profile(step): + """ + Profile a Pipeline step and save the results as HTML file in the project output + directory. + + Usage: + @profile + def step(self): + pass + """ + + @wraps(step) + def wrapper(*arg, **kwargs): + pipeline_instance = arg[0] + project = pipeline_instance.project + + with Profiler() as profiler: + result = step(*arg, **kwargs) + + output_file = project.get_output_file_path("profile", "html") + output_file.write_text(profiler.output_html()) + + pipeline_instance.log(f"Profiling results at {output_file.resolve()}") + + return result + + return wrapper diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 906a2ee3a1..81ae91c21d 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -1,345 +1,347 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. 
-# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import logging -import os -import shutil -from datetime import datetime -from pathlib import Path - -from django.core.exceptions import FieldDoesNotExist -from django.core.validators import EMPTY_VALUES -from django.db import models - -import openpyxl -import requests -from typecode.contenttype import get_type - -from scancodeio.settings import download_store -from scanpipe import pipes -from scanpipe.models import CodebaseRelation -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredDependency -from scanpipe.models import DiscoveredLicense -from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource -from scanpipe.pipes import scancode -from scanpipe.pipes.output import mappings_key_by_fieldname - -logger = logging.getLogger(__name__) - - -def copy_input(input_location, dest_path): - """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" - input_path = Path(input_location) - destination_dir = Path(dest_path) - destination = destination_dir / input_path.name - - if input_path.is_dir(): - shutil.copytree(input_location, destination) - else: - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - shutil.copyfile(input_location, destination) - - return destination - - -def copy_inputs(input_locations, dest_path): - """Copy the provided ``input_locations`` to the ``dest_path``.""" - for input_location in input_locations: - copy_input(input_location, dest_path) - - -def move_input(input_location, dest_path): - """Move the provided ``input_location`` to the ``dest_path``.""" - destination = dest_path / Path(input_location).name - return shutil.move(input_location, destination) - - -def move_inputs(inputs, dest_path): - """Move the provided ``inputs`` to the ``dest_path``.""" - for input_location in inputs: - move_input(input_location, dest_path) - - -def get_tool_name_from_scan_headers(scan_data): - """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - tool_name = first_header.get("tool_name", "") - return tool_name - - -def get_extra_data_from_scan_headers(scan_data): - """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - if extra_data := first_header.get("extra_data"): - return extra_data - - -def is_archive(location): - """Return True if the file at ``location`` is an archive.""" - return get_type(location).is_archive - - -def load_inventory_from_toolkit_scan(project, 
input_location): - """ - Create license detections, packages, dependencies, and resources - loaded from the ScanCode-toolkit scan results located at ``input_location``. - """ - scanned_codebase = scancode.get_virtual_codebase(project, input_location) - scancode.create_discovered_licenses(project, scanned_codebase) - scancode.create_discovered_packages(project, scanned_codebase) - scancode.create_codebase_resources(project, scanned_codebase) - scancode.create_discovered_dependencies( - project, scanned_codebase, strip_datafile_path_root=True - ) - scancode.load_todo_issues(project, scanned_codebase) - - -def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): - """ - Create packages, dependencies, license detections, resources, and relations - loaded from a ScanCode.io JSON output provided as ``scan_data``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - for detection_data in scan_data.get("license_detections", []): - pipes.update_or_create_license_detection(project, detection_data) - - for package_data in scan_data.get("packages", []): - pipes.update_or_create_package(project, package_data) - - for resource_data in scan_data.get("files", []): - pipes.update_or_create_resource(project, resource_data) - - for dependency_data in scan_data.get("dependencies", []): - pipes.update_or_create_dependency(project, dependency_data) - - for relation_data in scan_data.get("relations", []): - pipes.get_or_create_relation(project, relation_data) - - if extra_data := get_extra_data_from_scan_headers(scan_data): - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -model_to_object_maker_func = { - DiscoveredPackage: pipes.update_or_create_package, - DiscoveredDependency: pipes.update_or_create_dependency, - DiscoveredLicense: pipes.update_or_create_license_detection, - CodebaseResource: pipes.update_or_create_resource, - CodebaseRelation: pipes.get_or_create_relation, -} - -worksheet_name_to_model = { - "PACKAGES": DiscoveredPackage, - "LICENSE_DETECTIONS": DiscoveredLicense, - "RESOURCES": CodebaseResource, - "DEPENDENCIES": DiscoveredDependency, - "RELATIONS": CodebaseRelation, -} - - -def get_worksheet_data(worksheet): - """Return the data from provided ``worksheet`` as a list of dict.""" - try: - header = [cell.value for cell in next(worksheet.rows)] - except StopIteration: - return {} - - worksheet_data = [ - dict(zip(header, row)) - for row in worksheet.iter_rows(min_row=2, values_only=True) - ] - return worksheet_data - - -def clean_xlsx_field_value(model_class, field_name, value): - """Clean the ``value`` for compatibility with the database ``model_class``.""" - if value in EMPTY_VALUES: - return - - if field_name == "for_packages": - return value.splitlines() - - elif field_name in ["purl", "for_package_uid", "datafile_path"]: - return value - - try: - field = model_class._meta.get_field(field_name) - except FieldDoesNotExist: - return - - if dict_key := mappings_key_by_fieldname.get(field_name): - return [{dict_key: entry} for entry in value.splitlines()] - - elif isinstance(field, models.JSONField): - if field.default is list: - return value.splitlines() - elif field.default is dict: - return # dict stored as JSON are not supported - - return value - - -def clean_xlsx_data_to_model_data(model_class, xlsx_data): - """Clean the ``xlsx_data`` for compatibility with the database ``model_class``.""" - 
cleaned_data = {} - - for field_name, value in xlsx_data.items(): - if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): - cleaned_data[field_name] = cleaned_value - - return cleaned_data - - -def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): - """ - Create packages, dependencies, resources, and relations loaded from XLSX file - located at ``input_location``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) - - for worksheet_name, model_class in worksheet_name_to_model.items(): - if worksheet_name not in workbook: - continue - - worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) - for row_data in worksheet_data: - object_maker_func = model_to_object_maker_func.get(model_class) - cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) - if cleaned_data: - object_maker_func(project, cleaned_data) - - if "LAYERS" in workbook: - layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) - extra_data = {"layers": layers_data} - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -def add_input_from_url(project, url, filename=None): - """ - Download the file from the provided ``url`` and add it as an InputSource for the - specified ``project``. Optionally, specify a ``filename`` for the downloaded file. - If archiving is enabled, store the content in the DownloadStore and save metadata. - """ - try: - response = requests.get(url, stream=True, timeout=30) - response.raise_for_status() - content = response.content - except requests.RequestException as e: - logger.error(f"Failed to download {url}: {e}") - raise - - filename = filename or url.split("/")[-1] or "downloaded_file" - - if download_store: - try: - download = download_store.put( - content=content, - download_url=url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to archive download for {url}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - download_url=url, - file_path=str(input_path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise - - -def add_input_from_upload(project, uploaded_file): - """ - Add an uploaded file as an InputSource for the specified ``project``. - If archiving is enabled, store the content in the DownloadStore and save metadata. 
- """ - content = uploaded_file.read() - filename = uploaded_file.name - - if download_store: - try: - download = download_store.put( - content=content, - download_url="", - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to archive upload {filename}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - file_path=str(input_path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
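+#
+# Storage layout note (a sketch of what LocalFilesystemProvider in
+# scanpipe/archiving.py produces; other providers may use different keys):
+# content is stored once per SHA256, alongside one origin-<hash>.json metadata
+# file per distinct (filename, download_date, download_url) triple, e.g.:
+#
+#   <root>/59/4c/67d9.../content
+#   <root>/59/4c/67d9.../origin-1a2b....json
+#
+# add_input_from_url() and add_input_from_upload() below rely on this store
+# for deduplication when download archiving is enabled.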
+
+import logging
+import os
+import shutil
+from datetime import datetime
+from pathlib import Path
+
+from django.core.exceptions import FieldDoesNotExist
+from django.core.validators import EMPTY_VALUES
+from django.db import models
+
+import openpyxl
+import requests
+from typecode.contenttype import get_type
+
+from scancodeio.settings import download_store
+from scanpipe import pipes
+from scanpipe.models import CodebaseRelation
+from scanpipe.models import CodebaseResource
+from scanpipe.models import DiscoveredDependency
+from scanpipe.models import DiscoveredLicense
+from scanpipe.models import DiscoveredPackage
+from scanpipe.models import InputSource
+from scanpipe.pipes import scancode
+from scanpipe.pipes.output import mappings_key_by_fieldname
+
+logger = logging.getLogger(__name__)
+
+
+def copy_input(input_location, dest_path):
+    """Copy the ``input_location`` (file or directory) to the ``dest_path``."""
+    input_path = Path(input_location)
+    destination_dir = Path(dest_path)
+    destination = destination_dir / input_path.name
+
+    if input_path.is_dir():
+        shutil.copytree(input_location, destination)
+    else:
+        if not os.path.exists(destination_dir):
+            os.makedirs(destination_dir)
+        shutil.copyfile(input_location, destination)
+
+    return destination
+
+
+def copy_inputs(input_locations, dest_path):
+    """Copy the provided ``input_locations`` to the ``dest_path``."""
+    for input_location in input_locations:
+        copy_input(input_location, dest_path)
+
+
+def move_input(input_location, dest_path):
+    """Move the provided ``input_location`` to the ``dest_path``."""
+    destination = dest_path / Path(input_location).name
+    return shutil.move(input_location, destination)
+
+
+def move_inputs(inputs, dest_path):
+    """Move the provided ``inputs`` to the ``dest_path``."""
+    for input_location in inputs:
+        move_input(input_location, dest_path)
+
+
+def get_tool_name_from_scan_headers(scan_data):
+    """Return the ``tool_name`` of the first header in the provided ``scan_data``."""
+    if headers := scan_data.get("headers", []):
+        first_header = headers[0]
+        tool_name = first_header.get("tool_name", "")
+        return tool_name
+
+
+def get_extra_data_from_scan_headers(scan_data):
+    """Return the ``extra_data`` of the first header in the provided ``scan_data``."""
+    if headers := scan_data.get("headers", []):
+        first_header = headers[0]
+        if extra_data := first_header.get("extra_data"):
+            return extra_data
+
+
+def is_archive(location):
+    """Return True if the file at ``location`` is an archive."""
+    return get_type(location).is_archive
+
+
+def load_inventory_from_toolkit_scan(project, input_location):
+    """
+    Create license detections, packages, dependencies, and resources
+    loaded from the ScanCode-toolkit scan results located at ``input_location``.
+    """
+    scanned_codebase = scancode.get_virtual_codebase(project, input_location)
+    scancode.create_discovered_licenses(project, scanned_codebase)
+    scancode.create_discovered_packages(project, scanned_codebase)
+    scancode.create_codebase_resources(project, scanned_codebase)
+    scancode.create_discovered_dependencies(
+        project, scanned_codebase, strip_datafile_path_root=True
+    )
+    scancode.load_todo_issues(project, scanned_codebase)
+
+
+def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None):
+    """
+    Create packages, dependencies, license detections, resources, and relations
+    loaded from a ScanCode.io JSON output provided as ``scan_data``.
+ + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. + """ + for detection_data in scan_data.get("license_detections", []): + pipes.update_or_create_license_detection(project, detection_data) + + for package_data in scan_data.get("packages", []): + pipes.update_or_create_package(project, package_data) + + for resource_data in scan_data.get("files", []): + pipes.update_or_create_resource(project, resource_data) + + for dependency_data in scan_data.get("dependencies", []): + pipes.update_or_create_dependency(project, dependency_data) + + for relation_data in scan_data.get("relations", []): + pipes.get_or_create_relation(project, relation_data) + + if extra_data := get_extra_data_from_scan_headers(scan_data): + if extra_data_prefix: + extra_data = {extra_data_prefix: extra_data} + project.update_extra_data(extra_data) + + +model_to_object_maker_func = { + DiscoveredPackage: pipes.update_or_create_package, + DiscoveredDependency: pipes.update_or_create_dependency, + DiscoveredLicense: pipes.update_or_create_license_detection, + CodebaseResource: pipes.update_or_create_resource, + CodebaseRelation: pipes.get_or_create_relation, +} + +worksheet_name_to_model = { + "PACKAGES": DiscoveredPackage, + "LICENSE_DETECTIONS": DiscoveredLicense, + "RESOURCES": CodebaseResource, + "DEPENDENCIES": DiscoveredDependency, + "RELATIONS": CodebaseRelation, +} + + +def get_worksheet_data(worksheet): + """Return the data from provided ``worksheet`` as a list of dict.""" + try: + header = [cell.value for cell in next(worksheet.rows)] + except StopIteration: + return {} + + worksheet_data = [ + dict(zip(header, row)) + for row in worksheet.iter_rows(min_row=2, values_only=True) + ] + return worksheet_data + + +def clean_xlsx_field_value(model_class, field_name, value): + """Clean the ``value`` for compatibility with the database ``model_class``.""" + if value in EMPTY_VALUES: + return + + if field_name == "for_packages": + return value.splitlines() + + elif field_name in ["purl", "for_package_uid", "datafile_path"]: + return value + + try: + field = model_class._meta.get_field(field_name) + except FieldDoesNotExist: + return + + if dict_key := mappings_key_by_fieldname.get(field_name): + return [{dict_key: entry} for entry in value.splitlines()] + + elif isinstance(field, models.JSONField): + if field.default is list: + return value.splitlines() + elif field.default is dict: + return # dict stored as JSON are not supported + + return value + + +def clean_xlsx_data_to_model_data(model_class, xlsx_data): + """Clean the ``xlsx_data`` for compatibility with the database ``model_class``.""" + cleaned_data = {} + + for field_name, value in xlsx_data.items(): + if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): + cleaned_data[field_name] = cleaned_value + + return cleaned_data + + +def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): + """ + Create packages, dependencies, resources, and relations loaded from XLSX file + located at ``input_location``. + + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. 
+    """
+    workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True)
+
+    for worksheet_name, model_class in worksheet_name_to_model.items():
+        if worksheet_name not in workbook:
+            continue
+
+        worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name])
+        for row_data in worksheet_data:
+            object_maker_func = model_to_object_maker_func.get(model_class)
+            cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data)
+            if cleaned_data:
+                object_maker_func(project, cleaned_data)
+
+    if "LAYERS" in workbook:
+        layers_data = get_worksheet_data(worksheet=workbook["LAYERS"])
+        extra_data = {"layers": layers_data}
+        if extra_data_prefix:
+            extra_data = {extra_data_prefix: extra_data}
+        project.update_extra_data(extra_data)
+
+
+def add_input_from_url(project, url, filename=None):
+    """
+    Download the file from the provided ``url`` and add it as an InputSource for the
+    specified ``project``. Optionally, specify a ``filename`` for the downloaded file.
+    If archiving is enabled, store the content in the DownloadStore and save metadata.
+    """
+    try:
+        response = requests.get(url, stream=True, timeout=30)
+        response.raise_for_status()
+        content = response.content
+    except requests.RequestException as e:
+        logger.error(f"Failed to download {url}: {e}")
+        raise
+
+    filename = filename or url.split("/")[-1] or "downloaded_file"
+
+    if download_store:
+        try:
+            download = download_store.put(
+                content=content,
+                download_url=url,
+                download_date=datetime.now().isoformat(),
+                filename=filename,
+            )
+            InputSource.objects.create(
+                project=project,
+                sha256=download.sha256,
+                download_url=download.download_url,
+                filename=download.filename,
+                download_date=download.download_date,
+                file_path=str(download.path),
+                is_uploaded=False,
+            )
+        except Exception as e:
+            logger.error(f"Failed to archive download for {url}: {e}")
+            raise
+    else:
+        input_path = project.input_path / filename
+        try:
+            input_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(input_path, "wb") as f:
+                f.write(content)
+            InputSource.objects.create(
+                project=project,
+                filename=filename,
+                download_url=url,
+                file_path=str(input_path),
+                is_uploaded=False,
+            )
+        except Exception as e:
+            logger.error(f"Failed to save {filename} to {input_path}: {e}")
+            raise
+
+
+def add_input_from_upload(project, uploaded_file):
+    """
+    Add an uploaded file as an InputSource for the specified ``project``.
+    If archiving is enabled, store the content in the DownloadStore and save metadata.
+ """ + content = uploaded_file.read() + filename = uploaded_file.name + + if download_store: + try: + download = download_store.put( + content=content, + download_url="", + download_date=datetime.now().isoformat(), + filename=filename, + ) + InputSource.objects.create( + project=project, + sha256=download.sha256, + download_url=download.download_url, + filename=download.filename, + download_date=download.download_date, + file_path=str(download.path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to archive upload {filename}: {e}") + raise + else: + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + file_path=str(input_path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise \ No newline at end of file diff --git a/scanpipe/tests/test_archiving.py b/scanpipe/tests/test_archiving.py index 0da1a236b5..a249c96c46 100644 --- a/scanpipe/tests/test_archiving.py +++ b/scanpipe/tests/test_archiving.py @@ -1,86 +1,86 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
- - -import hashlib -from pathlib import Path - -from django.test import TestCase - -from scanpipe.archiving import LocalFilesystemProvider -from scanpipe.tests import make_project - - -class TestArchiving(TestCase): - def setUp(self): - self.project = make_project() - self.root_path = Path(__file__).parent / "data" / "test_downloads" - self.store = LocalFilesystemProvider(root_path=self.root_path) - self.test_content = b"test content" - self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - self.test_filename = "sample.tar.gz" - - def tearDown(self): - if self.root_path.exists(): - import shutil - - shutil.rmtree(self.root_path) - - def test_local_filesystem_provider_put_get(self): - download = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - sha256 = hashlib.sha256(self.test_content).hexdigest() - self.assertEqual(download.sha256, sha256) - self.assertEqual(download.download_url, self.test_url) - self.assertEqual(download.filename, self.test_filename) - self.assertEqual(download.download_date, "2025-08-21T09:00:00") - content_path = ( - self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" - ) - self.assertTrue(content_path.exists()) - with open(content_path, "rb") as f: - self.assertEqual(f.read(), self.test_content) - - retrieved = self.store.get(sha256) - self.assertEqual(retrieved.sha256, sha256) - self.assertEqual(retrieved.download_url, self.test_url) - self.assertEqual(retrieved.filename, self.test_filename) - - def test_local_filesystem_provider_deduplication(self): - download1 = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - download2 = self.store.put( - content=self.test_content, - download_url="https://files.pythonhosted.org/packages/another.tar.gz", - download_date="2025-08-21T10:00:00", - filename="another.tar.gz", - ) - self.assertEqual(download1.sha256, download2.sha256) - self.assertEqual(download1.download_url, self.test_url) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
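+#
+# These tests drive LocalFilesystemProvider against a throwaway directory:
+# put() must lay content out as <root>/<sha256[:2]>/<sha256[2:4]>/<sha256[4:]>/
+# content, and archiving identical content under two different URLs must
+# deduplicate to a single SHA256 entry (one content file, with one
+# origin-*.json metadata file written per distinct origin).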
+ + +import hashlib +from pathlib import Path + +from django.test import TestCase + +from scanpipe.archiving import LocalFilesystemProvider +from scanpipe.tests import make_project + + +class TestArchiving(TestCase): + def setUp(self): + self.project = make_project() + self.root_path = Path(__file__).parent / "data" / "test_downloads" + self.store = LocalFilesystemProvider(root_path=self.root_path) + self.test_content = b"test content" + self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" + self.test_filename = "sample.tar.gz" + + def tearDown(self): + if self.root_path.exists(): + import shutil + + shutil.rmtree(self.root_path) + + def test_local_filesystem_provider_put_get(self): + download = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + sha256 = hashlib.sha256(self.test_content).hexdigest() + self.assertEqual(download.sha256, sha256) + self.assertEqual(download.download_url, self.test_url) + self.assertEqual(download.filename, self.test_filename) + self.assertEqual(download.download_date, "2025-08-21T09:00:00") + content_path = ( + self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" + ) + self.assertTrue(content_path.exists()) + with open(content_path, "rb") as f: + self.assertEqual(f.read(), self.test_content) + + retrieved = self.store.get(sha256) + self.assertEqual(retrieved.sha256, sha256) + self.assertEqual(retrieved.download_url, self.test_url) + self.assertEqual(retrieved.filename, self.test_filename) + + def test_local_filesystem_provider_deduplication(self): + download1 = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + download2 = self.store.put( + content=self.test_content, + download_url="https://files.pythonhosted.org/packages/another.tar.gz", + download_date="2025-08-21T10:00:00", + filename="another.tar.gz", + ) + self.assertEqual(download1.sha256, download2.sha256) + self.assertEqual(download1.download_url, self.test_url) diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index e55a90cace..3f2848cf1b 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -1,112 +1,143 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: -# http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, -# software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an -# "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. 
-# Visit https://github.com/aboutcode-org/scancode.io for support and download. - - -from pathlib import Path -from unittest.mock import patch - -from django.core.files.uploadedfile import SimpleUploadedFile -from django.test import TestCase - -from scancodeio.settings import settings -from scanpipe.models import InputSource -from scanpipe.pipes.input import add_input_from_upload -from scanpipe.pipes.input import add_input_from_url -from scanpipe.tests import make_project - - -class TestInput(TestCase): - def setUp(self): - self.project = make_project() - self.test_filename = "sample.tar.gz" - self.test_data_path = ( - Path(__file__).parent / "data" / "test-downloads" / self.test_filename - ) - with open(self.test_data_path, "rb") as f: - self.test_content = f.read() - - @patch("requests.get") - def test_add_input_from_url(self, mock_get): - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url(self.project, test_url, filename=self.test_filename) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - @patch("requests.get") - def test_add_input_from_url_fallback(self, mock_get): - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url(self.project, test_url, filename=self.test_filename) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith(str(self.project.input_path)) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - def test_add_input_from_upload(self): - uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertTrue(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - def test_add_input_from_upload_fallback(self): - uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) - 
self.assertTrue(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith(str(self.project.input_path)) - ) - self.assertTrue(Path(input_source.file_path).exists()) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: +# http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, +# software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an +# "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + + +from pathlib import Path +from unittest.mock import patch + +from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase + +from scanpipe.models import InputSource +from scanpipe.pipes.input import add_input_from_upload +from scanpipe.pipes.input import add_input_from_url +from scancodeio.settings import settings +from scanpipe.tests import make_project + + +class TestInput(TestCase): + def setUp(self): + self.project = make_project() + self.test_filename = "sample.tar.gz" + self.test_data_path = ( + Path(__file__).parent / + "data" / + "test-downloads" / + self.test_filename + ) + with open(self.test_data_path, "rb") as f: + self.test_content = f.read() + + @patch("requests.get") + def test_add_input_from_url(self, mock_get): + test_url = ( + "https://files.pythonhosted.org/" + "packages/sample.tar.gz" + ) + mock_get.return_value.content = self.test_content + mock_get.return_value.status_code = 200 + add_input_from_url( + self.project, + test_url, + filename=self.test_filename + ) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, test_url) + self.assertTrue(input_source.sha256) + self.assertTrue(input_source.download_date) + self.assertFalse(input_source.is_uploaded) + self.assertTrue( + input_source.file_path.startswith( + settings.CENTRAL_ARCHIVE_PATH + ) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + @patch("scanpipe.pipes.input.download_store", None) + @patch("requests.get") + def test_add_input_from_url_fallback(self, mock_get): + test_url = ( + "https://files.pythonhosted.org/" + "packages/sample.tar.gz" + ) + mock_get.return_value.content = self.test_content + mock_get.return_value.status_code = 200 + add_input_from_url( + self.project, + test_url, + filename=self.test_filename + ) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, test_url) + 
self.assertFalse(input_source.sha256) + self.assertFalse(input_source.download_date) + self.assertFalse(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith( + str(self.project.input_path) + ) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + def test_add_input_from_upload(self): + uploaded_file = SimpleUploadedFile( + self.test_filename, + self.test_content + ) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertTrue(input_source.sha256) + self.assertTrue(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + input_source.file_path.startswith( + settings.CENTRAL_ARCHIVE_PATH + ) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + @patch("scanpipe.pipes.input.download_store", None) + def test_add_input_from_upload_fallback(self): + uploaded_file = SimpleUploadedFile( + self.test_filename, + self.test_content + ) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertFalse(input_source.sha256) + self.assertFalse(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith( + str(self.project.input_path) + ) + ) + self.assertTrue(Path(input_source.file_path).exists()) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 0831e22081..6439e842dd 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1,2057 +1,2057 @@ - -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. 
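The two fallback tests above patch scanpipe.pipes.input.download_store to None and then expect the file under project.input_path with empty sha256/download_date, while the non-patched tests expect settings.CENTRAL_ARCHIVE_PATH. A rough sketch of the branch those assertions imply for add_input_from_url, under the assumption that download_store is a module-level global and that path_for() resolves a stored digest to a filesystem path (both assumptions, not the actual pipes/input.py):

    from datetime import datetime, timezone
    from pathlib import Path

    import requests

    from scanpipe.models import InputSource

    def add_input_from_url(project, url, filename=None):
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        filename = filename or url.rsplit("/", 1)[-1]
        sha256, download_date = "", None
        if download_store:  # None when download archiving is disabled
            download = download_store.put(
                content=response.content,
                download_url=url,
                download_date=datetime.now(timezone.utc).isoformat(),
                filename=filename,
            )
            sha256, download_date = download.sha256, download.download_date
            file_path = str(download_store.path_for(sha256))  # hypothetical
        else:
            # Fallback: keep the download inside the per-project input/
            # directory and record no archiving metadata.
            file_path = str(project.input_path / filename)
            Path(file_path).write_bytes(response.content)
        return InputSource.objects.create(
            project=project,
            filename=filename,
            download_url=url,
            file_path=file_path,
            sha256=sha256,
            download_date=download_date,
            is_uploaded=False,
        )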
- -import io -import json -import os -import sys -import tempfile -from contextlib import redirect_stderr -from pathlib import Path -from unittest import mock -from unittest import skipIf - -from django.conf import settings -from django.test import TestCase -from django.test import tag - -from packageurl import PackageURL -from scancode.cli_test_utils import purl_with_fake_uuid -from scorecode.models import PackageScore - -from scanpipe import pipes -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource -from scanpipe.pipelines import CommonStepsMixin -from scanpipe.pipelines import InputFilesError -from scanpipe.pipelines import Pipeline -from scanpipe.pipelines import analyze_root_filesystem -from scanpipe.pipelines import deploy_to_develop -from scanpipe.pipelines import is_pipeline -from scanpipe.pipelines import scan_single_package -from scanpipe.pipes import d2d -from scanpipe.pipes import flag -from scanpipe.pipes import output -from scanpipe.pipes import scancode -from scanpipe.pipes.input import copy_input -from scanpipe.tests import FIXTURES_REGEN -from scanpipe.tests import make_mock_response -from scanpipe.tests import make_package -from scanpipe.tests import make_project -from scanpipe.tests import package_data1 -from scanpipe.tests.pipelines.do_nothing import DoNothing -from scanpipe.tests.pipelines.download_inputs import DownloadInput -from scanpipe.tests.pipelines.profile_step import ProfileStep -from scanpipe.tests.pipelines.steps_as_attribute import StepsAsAttribute -from scanpipe.tests.pipelines.with_groups import WithGroups - -from_docker_image = os.environ.get("FROM_DOCKER_IMAGE") - - -class ScanPipePipelinesTest(TestCase): - data = Path(__file__).parent / "data" - - def test_scanpipe_pipeline_class_pipeline_name_attribute(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline_instance = DoNothing(run) - self.assertEqual("do_nothing", pipeline_instance.pipeline_name) - - def test_scanpipe_pipeline_class_get_info(self): - expected = { - "description": "Description section of the doc string.", - "summary": "Do nothing, in 2 steps.", - "steps": [ - {"name": "step1", "doc": "Step1 doc.", "groups": []}, - {"name": "step2", "doc": "Step2 doc.", "groups": []}, - ], - "available_groups": [], - } - self.assertEqual(expected, DoNothing.get_info()) - - expected = { - "summary": "Profile a step using the @profile decorator.", - "description": "", - "steps": [ - {"name": "step", "doc": "", "groups": []}, - ], - "available_groups": [], - } - self.assertEqual(expected, ProfileStep.get_info()) - - def test_scanpipe_pipeline_class_get_summary(self): - expected = "Do nothing, in 2 steps." - self.assertEqual(expected, DoNothing.get_summary()) - - expected = "Profile a step using the @profile decorator." 
- self.assertEqual(expected, ProfileStep.get_summary()) - - def test_scanpipe_pipeline_class_log(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - pipeline.log("Event1") - pipeline.log("Event2") - - run.refresh_from_db() - self.assertIn("Event1", run.log) - self.assertIn("Event2", run.log) - - def test_scanpipe_pipeline_class_execute(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode) - self.assertEqual("", out) - - run.refresh_from_db() - self.assertIn("Pipeline [do_nothing] starting", run.log) - self.assertIn("Step [step1] starting", run.log) - self.assertIn("Step [step1] completed", run.log) - self.assertIn("Step [step2] starting", run.log) - self.assertIn("Step [step2] completed", run.log) - self.assertIn("Pipeline completed", run.log) - - def test_scanpipe_pipeline_class_execute_with_exception(self): - project1 = make_project() - run = project1.add_pipeline("raise_exception") - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode) - self.assertTrue(out.startswith("Error message")) - self.assertIn("Traceback:", out) - self.assertIn("in execute", out) - self.assertIn("step(self)", out) - self.assertIn("in raise_exception", out) - self.assertIn("raise ValueError", out) - - run.refresh_from_db() - self.assertIn("Pipeline [raise_exception] starting", run.log) - self.assertIn("Step [raise_exception_step] starting", run.log) - self.assertIn("Pipeline failed", run.log) - - @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step1") - @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step2") - def test_scanpipe_pipeline_class_execute_with_selected_steps(self, step2, step1): - step1.__name__ = "step1" - step1.groups = [] - step2.__name__ = "step2" - step2.groups = [] - - project1 = make_project() - run = project1.add_pipeline("do_nothing") - run.update(selected_steps=["step2", "not_existing_step"]) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode) - self.assertEqual("", out) - - step1.assert_not_called() - step2.assert_called() - - run.refresh_from_db() - self.assertIn("Pipeline [do_nothing] starting", run.log) - self.assertIn("Step [step1] skipped", run.log) - self.assertIn("Step [step2] starting", run.log) - self.assertIn("Step [step2] completed", run.log) - self.assertIn("Pipeline completed", run.log) - - def test_scanpipe_pipeline_class_download_inputs_attribute(self): - project1 = make_project() - run = project1.add_pipeline("download_inputs") - pipeline = run.make_pipeline_instance() - self.assertTrue(pipeline.download_inputs) - expected = (CommonStepsMixin.download_missing_inputs,) - self.assertEqual(expected, pipeline.get_initial_steps()) - expected = (CommonStepsMixin.download_missing_inputs, DownloadInput.step1) - self.assertEqual(expected, pipeline.get_steps()) - pipeline.execute() - self.assertIn("Step [download_missing_inputs]", run.log) - - run = project1.add_pipeline("profile_step") - pipeline = run.make_pipeline_instance() - self.assertFalse(pipeline.download_inputs) - pipeline.execute() - self.assertNotIn("Step [download_missing_inputs]", run.log) - - @mock.patch("requests.sessions.Session.get") - def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get): - project1 = make_project() - run = 
project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - file_location = self.data / "aboutcode" / "notice.NOTICE" - input_source = project1.add_input_source( - filename=file_location.name, is_uploaded=True - ) - self.assertFalse(input_source.exists()) - with self.assertRaises(InputFilesError) as error: - pipeline.download_missing_inputs() - error_msg = ( - "InputFilesError encountered with the following issues:\n\n" - "Error 1: Uploaded file filename=notice.NOTICE [uploaded] not available." - "\n\nNo traceback available." - ) - self.assertEqual(error_msg, str(error.exception)) - self.assertIn( - "Uploaded file filename=notice.NOTICE [uploaded] not available.", run.log - ) - - project1.copy_input_from(file_location) - self.assertTrue(input_source.exists()) - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - pipeline.download_missing_inputs() - self.assertEqual("", run.log) - - download_url = "https://download.url/file.zip" - mock_get.return_value = make_mock_response(url=download_url) - input_source2 = project1.add_input_source(download_url=download_url) - pipeline.download_missing_inputs() - self.assertIn("Fetching input from https://download.url/file.zip", run.log) - input_source2.refresh_from_db() - self.assertEqual("file.zip", input_source2.filename) - self.assertTrue(input_source2.exists()) - mock_get.assert_called_once() - - @mock.patch("scanpipe.models.InputSource.fetch") - def test_scanpipe_pipeline_class_download_fetch_exception(self, mock_fetch): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - mock_fetch.side_effect = Exception("File not found") - download_url = "https://download.url/file.zip" - project1.add_input_source(download_url=download_url) - - with self.assertRaises(InputFilesError) as error: - pipeline.download_missing_inputs() - self.assertIn( - "InputFilesError encountered with the following issues:", - str(error.exception), - ) - self.assertIn("Error 1: File not found", str(error.exception)) - self.assertIn("Traceback (most recent call last):", str(error.exception)) - self.assertIn("Exception: File not found", str(error.exception)) - - self.assertIn("Fetching input from https://download.url/file.zip", run.log) - self.assertIn("https://download.url/file.zip could not be fetched.", run.log) - - @mock.patch("git.repo.base.Repo.clone_from") - def test_scanpipe_pipeline_class_download_missing_inputs_git_repo(self, mock_clone): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - download_url = "https://github.com/aboutcode-org/scancode.io.git" - input_source = project1.add_input_source(download_url=download_url) - - def mock_make_to_path(**kwargs): - to_path = kwargs.get("to_path") - to_path.mkdir() - - mock_clone.side_effect = mock_make_to_path - mock_clone.return_value = None - - pipeline.download_missing_inputs() - self.assertIn( - "Fetching input from https://github.com/aboutcode-org/scancode.io.git", - run.log, - ) - input_source.refresh_from_db() - self.assertEqual("scancode.io.git", input_source.filename) - self.assertTrue(input_source.exists()) - - @mock.patch("requests.get") - def test_archive_downloads(self, mock_get): - project1 = make_project() - run = project1.add_pipeline("scan_codebase") - pipeline = run.make_pipeline_instance() - test_filename = "sample.tar.gz" - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - test_data_path = 
(
-            Path(__file__).parent / "data" / "test-downloads" / test_filename
-        )
-        with open(test_data_path, "rb") as f:
-            test_content = f.read()
-
-        input_source = InputSource.objects.create(
-            project=project1,
-            filename=test_filename,
-            download_url=test_url,
-            is_uploaded=False,
-        )
-
-        mock_get.return_value.content = test_content
-        mock_get.return_value.status_code = 200
-
-        pipeline.download_missing_inputs()
-        input_source.refresh_from_db()
-        self.assertTrue(
-            input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)
-        )
-        self.assertTrue(Path(input_source.file_path).exists())
-
-        pipeline.archive_downloads()
-        input_source.refresh_from_db()
-        self.assertTrue(input_source.sha256)
-        self.assertTrue(input_source.download_date)
-        self.assertEqual(input_source.download_url, test_url)
-        self.assertEqual(input_source.filename, test_filename)
-
-        project2 = make_project(name="project2")
-        input_source2 = InputSource.objects.create(
-            project=project2,
-            filename=test_filename,
-            download_url=test_url,
-            is_uploaded=False,
-        )
-        run2 = project2.add_pipeline("scan_codebase")
-        pipeline2 = run2.make_pipeline_instance()
-        pipeline2.download_missing_inputs()
-        input_source2.refresh_from_db()
-        self.assertEqual(input_source.file_path, input_source2.file_path)
-        self.assertTrue(Path(input_source2.file_path).exists())
-
-    def test_scanpipe_pipeline_class_save_errors_context_manager(self):
-        project1 = make_project()
-        run = project1.add_pipeline("do_nothing")
-        pipeline = run.make_pipeline_instance()
-        self.assertEqual(project1, pipeline.project)
-
-        with pipeline.save_errors(Exception):
-            raise Exception("Error message")
-
-        message = project1.projectmessages.get()
-        self.assertEqual("do_nothing", message.model)
-        self.assertEqual({}, message.details)
-        self.assertEqual("Error message", message.description)
-        self.assertIn('raise Exception("Error message")', message.traceback)
-
-        resource1 = CodebaseResource.objects.create(project=project1, path="filename")
-        with pipeline.save_errors(Exception, resource=resource1):
-            raise Exception("Error message")
-        message = project1.projectmessages.latest("created_date")
-        self.assertEqual({"resource_path": str(resource1.path)}, message.details)
-
-    def test_scanpipe_pipelines_is_pipeline(self):
-        self.assertFalse(is_pipeline(None))
-        self.assertFalse(is_pipeline(Pipeline))
-        self.assertTrue(is_pipeline(DoNothing))
-
-        class SubSubClass(DoNothing):
-            pass
-
-        self.assertTrue(is_pipeline(SubSubClass))
-
-    def test_scanpipe_pipeline_class_get_graph(self):
-        expected = [
-            {"name": "step1", "doc": "Step1 doc.", "groups": []},
-            {"name": "step2", "doc": "Step2 doc.", "groups": []},
-        ]
-        self.assertEqual(expected, DoNothing.get_graph())
-
-    def test_scanpipe_pipelines_profile_decorator(self):
-        project1 = make_project()
-        run = project1.add_pipeline("profile_step")
-        pipeline_instance = run.make_pipeline_instance()
-
-        exitcode, out = pipeline_instance.execute()
-        self.assertEqual(0, exitcode)
-
-        run.refresh_from_db()
-        self.assertIn("Profiling results at", run.log)
-        self.assertIn("Pipeline completed", run.log)
-
-        self.assertEqual(1, len(project1.output_root))
-        output_file = project1.output_root[0]
-        self.assertTrue(output_file.startswith("profile-"))
-        self.assertTrue(output_file.endswith(".html"))
-
-    def test_scanpipe_pipeline_class_get_steps(self):
-        expected = (
-            DoNothing.step1,
-            DoNothing.step2,
-        )
-        self.assertEqual(expected, DoNothing.get_steps())
-
-        with self.assertRaises(TypeError) as cm:
-            StepsAsAttribute.get_steps()
-        
expected = "Use a ``steps(cls)`` classmethod to declare the steps." - self.assertEqual(expected, str(cm.exception)) - - def test_scanpipe_pipeline_class_get_steps_with_groups(self): - expected = (WithGroups.no_groups,) - self.assertEqual(expected, WithGroups.get_steps()) - self.assertEqual(expected, WithGroups.get_steps(groups=[])) - self.assertEqual(expected, WithGroups.get_steps(groups=["not_defined"])) - - expected = ( - WithGroups.grouped_with_foo_and_bar, - WithGroups.grouped_with_bar, - WithGroups.no_groups, - ) - self.assertEqual(expected, WithGroups.get_steps(groups=["bar"])) - self.assertEqual(expected, WithGroups.get_steps(groups=["foo", "bar"])) - - expected = ( - WithGroups.grouped_with_foo_and_bar, - WithGroups.no_groups, - ) - self.assertEqual(expected, WithGroups.get_steps(groups=["foo"])) - - def test_scanpipe_pipeline_class_get_available_groups(self): - self.assertEqual(["bar", "excluded", "foo"], WithGroups.get_available_groups()) - self.assertEqual([], DoNothing.get_available_groups()) - - def test_scanpipe_pipeline_class_env_loaded_from_config_file(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - config_file = project1.input_path / settings.SCANCODEIO_CONFIG_FILE - config_file.write_text("{*this is not valid yml*}") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - config_file.write_text("product_name: Product") - pipeline = run.make_pipeline_instance() - self.assertEqual({"product_name": "Product"}, pipeline.env) - - def test_scanpipe_pipeline_class_env_reloaded_after_extraction(self): - project1 = make_project() - - input_location = self.data / "settings" / "archived-scancode-config.zip" - project1.copy_input_from(input_location) - run = project1.add_pipeline("scan_codebase") - pipeline = run.make_pipeline_instance() - self.assertEqual({}, pipeline.env) - - # Manually run steps, env is reload from the scancode-config.yml contained in - # the archive - pipeline.copy_inputs_to_codebase_directory() - pipeline.extract_archives() - - expected = { - "product_name": "My Product Name", - "product_version": "1.0", - "ignored_patterns": ["*.tmp", "tests/*"], - } - self.assertEqual(expected, pipeline.env) - - def test_scanpipe_pipeline_class_flag_ignored_resources(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - self.assertIsNone(pipeline.env.get("ignored_patterns")) - - project1.settings.update({"ignored_patterns": "*.ext"}) - project1.save() - pipeline = run.make_pipeline_instance() - - with mock.patch("scanpipe.pipes.flag.flag_ignored_patterns") as mock_flag: - mock_flag.return_value = None - pipeline.flag_ignored_resources() - - mock_flag.assert_called_once() - patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS] - self.assertEqual(mock_flag.mock_calls[0].kwargs["patterns"], patterns_args) - self.assertEqual(mock_flag.mock_calls[0].kwargs["codebaseresources"].count(), 0) - - def test_scanpipe_pipeline_class_extract_archive(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - target = tempfile.mkdtemp() - input_location = str(self.data / "scancode" / "corrupted.tar.gz") - pipeline.extract_archive(input_location, target) - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors.get() - self.assertEqual("error", 
project_error.severity) - self.assertIn("gzip decompression failed", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "corrupted.tar.gz"}, project_error.details) - self.assertEqual("", project_error.traceback) - - def test_scanpipe_pipeline_class_extract_archives(self): - project1 = make_project() - run = project1.add_pipeline("do_nothing") - pipeline = run.make_pipeline_instance() - - input_location = str(self.data / "scancode" / "corrupted.tar.gz") - resource_location = copy_input(input_location, project1.codebase_path) - pipeline.extract_archives() - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors.get() - self.assertEqual("error", project_error.severity) - self.assertIn("gzip decompression failed", project_error.description) - self.assertEqual("extract_archives", project_error.model) - self.assertEqual( - {"resource_path": str(resource_location)}, project_error.details - ) - self.assertEqual("", project_error.traceback) - - -class RootFSPipelineTest(TestCase): - def test_scanpipe_rootfs_pipeline_extract_input_files_errors(self): - project1 = make_project() - run = project1.add_pipeline("analyze_root_filesystem_or_vm_image") - pipeline_instance = analyze_root_filesystem.RootFS(run) - - # Create 2 files in the input/ directory to generate error twice - project1.move_input_from(tempfile.mkstemp()[1]) - project1.move_input_from(tempfile.mkstemp()[1]) - self.assertEqual(2, len(project1.input_files)) - - with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - pipeline_instance.extract_input_files_to_codebase_directory() - - projects_errors = project1.projectmessages.all() - self.assertEqual(2, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - -def sort_for_os_compatibility(scan_data): - """Sort the ``scan_data`` files and relations in place. 
Return ``scan_data``.""" - if files := scan_data.get("files"): - files.sort(key=lambda x: x["path"]) - - if relations := scan_data.get("relations"): - relations.sort(key=lambda x: x["to_resource"]) - - return scan_data - - -@tag("slow") -class PipelinesIntegrationTest(TestCase): - """Integration tests to ensure the proper output for each built-in Pipelines.""" - - # Un-comment the following to display full diffs: - # maxDiff = None - data = Path(__file__).parent / "data" - exclude_from_diff = [ - "start_timestamp", - "end_timestamp", - "date", - "duration", - "input", - "compliance_alert", - "policy", - "tool_version", - "other_tools", - "created_date", - "log", - "uuid", - "size", # directory sizes are OS dependant - "size_count", - "--json-pp", - "--processes", - "--verbose", - # system_environment differs between systems - "system_environment", - "file_type", - # mime type and is_script are inconsistent across systems - "mime_type", - "is_script", - "notes", - "settings", - "description", - "traceback", - ] - - def _without_keys(self, data, exclude_keys): - """Return the `data` excluding the provided `exclude_keys`.""" - if isinstance(data, list): - return [self._without_keys(entry, exclude_keys) for entry in data] - - if isinstance(data, dict): - return { - key: ( - self._without_keys(value, exclude_keys) - if type(value) in [list, dict] - else value - ) - for key, value in data.items() - if key not in exclude_keys - } - - return data - - def purl_fields_with_fake_uuid(self, value, key): - purl_fields = ["purl", "for_packages", "package_uid"] - purl_name = "fixed-name-for-testing-5642512d1758" - purl_namespace = "fixed-namespace-for-testing-5642512d1758" - - if key == "name": - return purl_name - elif key == "namespace": - return purl_namespace - elif key in purl_fields: - purl_old = PackageURL.from_string(value) - if purl_old.type != "local-files": - return purl_with_fake_uuid(value) - - purl = PackageURL( - name=purl_name, - namespace=purl_namespace, - type="local-files", - version=purl_old.version, - qualifiers=purl_old.qualifiers, - subpath=purl_old.subpath, - ) - return purl_with_fake_uuid(purl.to_string()) - - def _normalize_package_uids(self, data): - """ - Return the `data`, where any `package_uid` value has been normalized - with `purl_with_fake_uuid()` - """ - fields_with_package_uids = [ - "package_uid", - "dependency_uid", - "for_package_uid", - "resolved_to_package_uid", - ] - if isinstance(data, list): - return [self._normalize_package_uids(entry) for entry in data] - - if isinstance(data, dict): - is_local_files = False - if data.get("type") and data["type"] == "local-files": - is_local_files = True - normalized_data = {} - for key, value in data.items(): - if isinstance(value, list | dict): - value = self._normalize_package_uids(value) - if key in fields_with_package_uids and value: - value = purl_with_fake_uuid(value) - if key == "for_packages" and value: - value = sorted( - [ - self.purl_fields_with_fake_uuid(package_uid, key) - for package_uid in value - ] - ) - if ( - is_local_files - and key in ("name", "namespace", "purl", "package_uid") - and value - ): - value = self.purl_fields_with_fake_uuid(value, key) - normalized_data[key] = value - return normalized_data - - return data - - def _sort_dependencies(self, data): - """ - Sort dependencies by their "for_package_uid". - - After dependency resolution in some cases we have multiple - dependency requirements resolved to a same package, and they - are not sorted the same way every time. 
- """ - mappings = data.get("dependencies") - if mappings: - mappings_by_uid = {} - for mapping in mappings: - uid = mapping.get("for_package_uid") or "" - mappings_by_uid[uid] = mapping - data["dependencies"] = list(dict(sorted(mappings_by_uid.items())).values()) - return data - - def test_package_uids_normalized_in_pipeline_integration_tests(self): - self.maxDiff = 1000 - data = { - "type": "local-files", - "package_uid": ( - "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23" - "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24" - ), - "for_packages": [ - ( - "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23" - "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24" - ) - ], - } - normalized_data = self._normalize_package_uids(data=data) - expected_data = { - "type": "local-files", - "package_uid": ( - "pkg:local-files/fixed-namespace-for-testing-5642512d1758/" - "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758" - ), - "for_packages": [ - ( - "pkg:local-files/fixed-namespace-for-testing-5642512d1758/" - "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758" - ) - ], - } - self.assertEqual(normalized_data, expected_data) - - def assertPipelineResultEqual( - self, expected_file, result_file, sort_dependencies=False, regen=FIXTURES_REGEN - ): - """Set `regen` to True to regenerate the expected results.""" - result_json = json.loads(Path(result_file).read_text()) - result_json = self._normalize_package_uids(result_json) - result_data = self._without_keys(result_json, self.exclude_from_diff) - if sort_dependencies: - result_data = self._sort_dependencies(result_data) - result_data = sort_for_os_compatibility(result_data) - - if regen: - expected_file.write_text(json.dumps(result_data, indent=2)) - - expected_json = json.loads(expected_file.read_text()) - expected_json = self._normalize_package_uids(expected_json) - expected_data = self._without_keys(expected_json, self.exclude_from_diff) - if sort_dependencies: - result_data = self._sort_dependencies(result_data) - expected_data = sort_for_os_compatibility(expected_data) - - self.assertEqual(expected_data, result_data) - - @skipIf(from_docker_image, "Random failure in the Docker context.") - def test_scanpipe_scan_package_pipeline_integration(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "scancode" / "is-npm-1.0.0.tgz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(4, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_package.json" - self.assertPipelineResultEqual(expected_file, scancode_file) - - summary_file = project1.get_latest_output(filename="summary") - expected_file = ( - self.data / "scancode" / "is-npm-1.0.0_scan_package_summary.json" - ) - self.assertPipelineResultEqual(expected_file, summary_file) - - # Ensure that we only have one instance of is-npm in `key_files_packages` - summary_data = json.loads(Path(summary_file).read_text()) - key_files_packages = summary_data.get("key_files_packages", []) - self.assertEqual(1, len(key_files_packages)) - key_file_package = 
key_files_packages[0] - key_file_package_purl = key_file_package.get("purl", "") - self.assertEqual("pkg:npm/is-npm@1.0.0", key_file_package_purl) - - @skipIf(from_docker_image, "Random failure in the Docker context.") - def test_scanpipe_scan_package_pipeline_integration_multiple_packages(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "scancode" / "multiple-is-npm-1.0.0.tar.gz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(9, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(2, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = ( - self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package.json" - ) - # Do not override the regen as this file is generated in regen_test_data - self.assertPipelineResultEqual(expected_file, scancode_file) - - summary_file = project1.get_latest_output(filename="summary") - expected_file = ( - self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package_summary.json" - ) - self.assertPipelineResultEqual(expected_file, summary_file) - - @mock.patch("scanpipe.pipelines.scan_single_package.is_archive") - def test_scanpipe_scan_package_single_extract_input_to_codebase_directory( - self, mock_is_archive - ): - project1 = make_project() - run = project1.add_pipeline("scan_single_package") - pipeline_instance = scan_single_package.ScanSinglePackage(run) - - project1.move_input_from(tempfile.mkstemp(suffix=".zip")[1]) - self.assertEqual(1, len(project1.input_files)) - - mock_is_archive.return_value = True - pipeline_instance.get_package_input() - with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - pipeline_instance.extract_input_to_codebase_directory() - - projects_errors = project1.projectmessages.all() - self.assertEqual(1, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - def test_scanpipe_scan_package_single_file(self): - pipeline_name = "scan_single_package" - project1 = make_project() - - input_location = self.data / "manifests" / "openpdf-parent-1.3.11.pom.xml" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(10, project1.discovereddependencies.count()) - - scancode_file = project1.get_latest_output(filename="scancode") - expected_file = ( - self.data / "manifests" / "openpdf-parent-1.3.11_scan_package.json" - ) - self.assertPipelineResultEqual(expected_file, scancode_file) - - @mock.patch("git.repo.base.Repo.clone_from") - def test_scanpipe_scan_package_single_package_git_repo(self, mock_clone): - pipeline_name = "scan_single_package" - project1 = make_project() - - download_url = 
"https://github.com/aboutcode-org/scancode.io.git" - project1.add_input_source(download_url=download_url) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - # Create the "fetched" git directory content - def mock_make_git_directory(**kwargs): - to_path = kwargs.get("to_path") # scancode.io.git - to_path.mkdir() - file_location = self.data / "aboutcode" / "notice.NOTICE" - copy_input(file_location, to_path) - - mock_clone.side_effect = mock_make_git_directory - mock_clone.return_value = None - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(2, project1.codebaseresources.count()) - self.assertEqual(0, project1.discoveredpackages.count()) - - def test_scanpipe_scan_codebase_pipeline_integration(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_scan_codebase_creates_top_level_paths(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] - - top_level_resources = project1.codebaseresources.filter(parent_path="") - top_level_paths = [resource.path for resource in top_level_resources] - - self.assertListEqual(top_level_paths, expected_top_level_paths) - - def test_scanpipe_scan_codebase_creates_parent_path_field(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] - expected_nested_paths = [ - "is-npm-1.0.0.tgz-extract/package/index.js", - "is-npm-1.0.0.tgz-extract/package/package.json", - "is-npm-1.0.0.tgz-extract/package/readme.md", - ] - - top_level_resources = project1.codebaseresources.filter(parent_path="") - top_level_paths = [resource.path for resource in top_level_resources] - - self.assertListEqual(top_level_paths, expected_top_level_paths) - - nested_resources = project1.codebaseresources.filter( - parent_path="is-npm-1.0.0.tgz-extract/package" - ) - nested_paths = [resource.path for resource in nested_resources] - - self.assertListEqual(nested_paths, expected_nested_paths) - - def test_scanpipe_inspect_packages_creates_packages_npm(self): - pipeline_name = 
"inspect_packages" - project1 = make_project() - - filename = "is-npm-1.0.0.tgz" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(1, project1.discovereddependencies.count()) - - package = project1.discoveredpackages.get() - dependency = project1.discovereddependencies.get() - - self.assertEqual(3, package.codebase_resources.count()) - self.assertEqual("pkg:npm/is-npm@1.0.0", dependency.for_package.purl) - self.assertEqual(package.datasource_ids, [dependency.datasource_id]) - self.assertEqual( - package.codebase_resources.get( - path="is-npm-1.0.0.tgz-extract/package/package.json" - ).path, - dependency.datafile_resource.path, - ) - - def test_scanpipe_inspect_packages_creates_packages_pypi(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "manifests" / "python-inspector-0.10.0.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(6, project1.codebaseresources.count()) - self.assertEqual(0, project1.discoveredpackages.count()) - self.assertEqual(26, project1.discovereddependencies.count()) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_inspect_packages_with_resolved_dependencies_npm(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_npm.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(4, project1.codebaseresources.count()) - self.assertEqual(7, project1.discoveredpackages.count()) - self.assertEqual(6, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data - / "dependencies" - / "resolved_dependencies_npm_inspect_packages.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_inspect_packages_with_resolved_dependencies_poetry(self): - pipeline_name = "inspect_packages" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_poetry.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(5, project1.codebaseresources.count()) - self.assertEqual(6, project1.discoveredpackages.count()) - self.assertEqual(10, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data - / "dependencies" - / "resolved_dependencies_poetry_inspect_packages.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not 
supported on macOS") - def test_scanpipe_resolved_dependencies_cocoapods(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = ( - self.data / "dependencies" / "resolved_dependencies_cocoapods.zip" - ) - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(25, project1.discoveredpackages.count()) - self.assertEqual(30, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "dependencies" / "resolved_dependencies_cocoapods.json" - ) - self.assertPipelineResultEqual( - expected_file, result_file, sort_dependencies=True - ) - - def test_scanpipe_resolved_dependencies_pip_inspect(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_pip.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(4, project1.discoveredpackages.count()) - self.assertEqual(17, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "dependencies" / "resolved_dependencies_pip.json" - self.assertPipelineResultEqual( - expected_file, - result_file, - ) - - def test_scanpipe_resolved_dependencies_nuget(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - - input_location = self.data / "dependencies" / "resolved_dependencies_nuget.zip" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, - selected_groups=["StaticResolver"], - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(3, project1.codebaseresources.count()) - self.assertEqual(34, project1.discoveredpackages.count()) - self.assertEqual(108, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "dependencies" / "resolved_dependencies_nuget.json" - self.assertPipelineResultEqual( - expected_file, - result_file, - sort_dependencies=True, - ) - - def test_scanpipe_scan_codebase_can_process_wheel(self): - pipeline_name = "scan_codebase" - project1 = make_project() - - filename = "daglib-0.6.0-py3-none-any.whl" - input_location = self.data / "scancode" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(11, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(8, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "scancode" / "daglib-0.6.0-py3-none-any.whl_scan_codebase.json" - ) - self.assertPipelineResultEqual(expected_file, 
result_file) - - @skipIf(sys.platform != "linux", "Expected results are inconsistent across OS") - def test_scanpipe_docker_pipeline_alpine_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "alpine_3_15_4.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(510, project1.codebaseresources.count()) - self.assertEqual(14, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "docker" / "alpine_3_15_4_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_does_not_report_errors_for_broken_symlinks(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "minitag.tar" - input_location = self.data / "image-with-symlinks" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - with redirect_stderr(io.StringIO()): - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - project_messages = project1.projectmessages.all() - self.assertEqual(1, len(project_messages)) - self.assertEqual("Distro not found.", project_messages[0].description) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "image-with-symlinks" / (filename + "-expected-scan.json") - ) - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform != "linux", "RPM related features only supported on Linux.") - def test_scanpipe_docker_pipeline_rpm_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "centos.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(29, project1.codebaseresources.count()) - self.assertEqual(101, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "docker" / "centos_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_debian_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "debian.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(16, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = 
self.data / "docker" / "debian_scan_codebase.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_docker_pipeline_distroless_debian_integration(self): - pipeline_name = "analyze_docker_image" - project1 = make_project() - - filename = "gcr_io_distroless_base.tar.gz" - input_location = self.data / "docker" / filename - project1.copy_input_from(input_location) - project1.add_input_source("https://download.url", filename) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(2458, project1.codebaseresources.count()) - self.assertEqual(6, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "docker" / "gcr_io_distroless_base_scan_codebase.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_rootfs_pipeline_integration(self): - pipeline_name = "analyze_root_filesystem_or_vm_image" - project1 = make_project() - - input_location = self.data / "rootfs" / "basic-rootfs.tar.gz" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(17, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "rootfs" / "basic-rootfs_root_filesystems.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_load_inventory_pipeline_integration(self): - pipeline_name = "load_inventory" - project1 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(18, project1.codebaseresources.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(4, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = ( - self.data / "asgiref" / "asgiref-3.3.0_load_inventory_expected.json" - ) - self.assertPipelineResultEqual(expected_file, result_file) - - # Using the ScanCode.io JSON output as the input - project2 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_scanpipe_output.json" - project2.copy_input_from(input_location) - - run = project2.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(18, project2.codebaseresources.count()) - self.assertEqual(2, project2.discoveredpackages.count()) - self.assertEqual(4, project2.discovereddependencies.count()) - - @mock.patch("scanpipe.pipes.vulnerablecode.is_available") - @mock.patch("scanpipe.pipes.vulnerablecode.is_configured") - @mock.patch("scanpipe.pipes.vulnerablecode.bulk_search_by_purl") - def test_scanpipe_find_vulnerabilities_pipeline_integration( - self, mock_bulk_search_by_purl, mock_is_configured, mock_is_available - ): - pipeline_name = "find_vulnerabilities" - project1 = make_project() - 
package1 = DiscoveredPackage.create_from_data(project1, package_data1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_configured.return_value = False - mock_is_available.return_value = False - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode, msg=out) - self.assertIn("VulnerableCode is not configured.", out) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_configured.return_value = True - mock_is_available.return_value = True - vulnerability_data = [ - { - "purl": "pkg:deb/debian/adduser@3.118?arch=all", - "affected_by_vulnerabilities": [ - { - "vulnerability_id": "VCID-cah8-awtr-aaad", - "summary": "An issue was discovered.", - }, - ], - }, - { - "purl": "pkg:deb/debian/adduser@3.118?qualifiers=1", - "affected_by_vulnerabilities": [ - { - "vulnerability_id": "VCID-cah8-awtr-aaad", - "summary": "An issue was discovered.", - }, - ], - }, - ] - mock_bulk_search_by_purl.return_value = vulnerability_data - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - expected = vulnerability_data[0]["affected_by_vulnerabilities"] - self.assertEqual(expected, package1.affected_by_vulnerabilities) - - @mock.patch("scorecode.ossf_scorecard.is_available") - def test_scanpipe_fetch_scores_pipeline_integration(self, mock_is_available): - pipeline_name = "fetch_scores" - project1 = make_project() - package1 = DiscoveredPackage.create_from_data(project1, package_data1) - package1.vcs_url = "https://github.com/ossf/scorecard" - package1.save() - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_available.return_value = False - exitcode, out = pipeline.execute() - self.assertEqual(1, exitcode, msg=out) - self.assertIn("ScoreCode service is not available.", out) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - mock_is_available.return_value = True - - package_score_data = { - "scoring_tool": "ossf_scorecard", - "scoring_tool_version": "v5.2.1", - "score": "9.7", - "scoring_tool_documentation_url": "https://github.com/[trunc...]", - "score_date": "2025-07-24T18:50:16Z", - } - with mock.patch("scorecode.ossf_scorecard.fetch_scorecard") as fetch: - fetch.return_value = PackageScore(**package_score_data) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - scorecard_entry = package1.scores.filter(scoring_tool="ossf-scorecard").first() - self.assertIsNotNone(scorecard_entry) - self.assertEqual("ossf-scorecard", scorecard_entry.scoring_tool) - self.assertEqual("v5.2.1", scorecard_entry.scoring_tool_version) - self.assertTrue(scorecard_entry.score) - - def test_scanpipe_resolve_dependencies_pipeline_integration(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp()[1]) - pipeline.execute() - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("get_packages_from_manifest", message.model) - expected = "No resources containing package data found in codebase." 
- self.assertIn(expected, message.description) - - def test_scanpipe_resolve_dependencies_pipeline_integration_empty_manifest(self): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) - pipeline.execute() - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("get_packages_from_manifest", message.model) - expected = "No packages could be resolved" - self.assertIn(expected, message.description) - - @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") - def test_scanpipe_resolve_dependencies_pipeline_integration_misc( - self, mock_resolve_dependencies - ): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - input_location = self.data / "manifests" / "requirements.txt" - project1.copy_input_from(input_location) - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - self.assertEqual(1, project1.discoveredpackages.count()) - - @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") - def test_scanpipe_resolve_dependencies_pipeline_pypi_integration( - self, mock_resolve_dependencies - ): - pipeline_name = "resolve_dependencies" - project1 = make_project() - selected_groups = ["DynamicResolver"] - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) - mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - exclude_fields = ["qualifiers", "release_date", "size"] - for field_name, value in package_data1.items(): - if value and field_name not in exclude_fields: - self.assertEqual(value, getattr(discoveredpackage, field_name)) - - def test_scanpipe_load_sbom_pipeline_aboutfile_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "manifests" / "Django-4.0.8-py3-none-any.whl.ABOUT" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - self.assertEqual("pypi", discoveredpackage.type) - self.assertEqual("django", discoveredpackage.name) - self.assertEqual("4.0.8", discoveredpackage.version) - self.assertEqual("bsd-new", discoveredpackage.declared_license_expression) - - def test_scanpipe_load_sbom_pipeline_spdx_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "manifests" / "toml.spdx.json" - project1.copy_input_from(input_location) - - run = 
project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - discoveredpackage = project1.discoveredpackages.get() - self.assertEqual("pypi", discoveredpackage.type) - self.assertEqual("toml", discoveredpackage.name) - self.assertEqual("0.10.2", discoveredpackage.version) - self.assertEqual("https://github.com/uiri/toml", discoveredpackage.homepage_url) - self.assertEqual("MIT", discoveredpackage.extracted_license_statement) - self.assertEqual("mit", discoveredpackage.declared_license_expression) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "cyclonedx" / "nested.cdx.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(3, project1.discoveredpackages.count()) - packages = project1.discoveredpackages.all() - expected_data = { - "pkg:pypi/toml@0.10.2?extension=tar.gz": { - "type": "pypi", - "name": "toml", - "version": "0.10.2", - "extracted_license_statement": "OFL-1.1\nApache-2.0", - "declared_license_expression": "ofl-1.1 OR apache-2.0", - "homepage_url": "https://cyclonedx.org/website", - "bug_tracking_url": "https://cyclonedx.org/issue-tracker", - "vcs_url": "https://cyclonedx.org/vcs", - "filename": "", - }, - "pkg:pypi/billiard@3.6.3.0": { - "type": "pypi", - "name": "billiard", - "version": "3.6.3.0", - "extracted_license_statement": "BSD-3-Clause", - "declared_license_expression": "bsd-new", - "homepage_url": "", - "bug_tracking_url": "", - "vcs_url": "", - "extra_data": "", - "filename": "", - }, - "pkg:pypi/fictional@9.10.2": { - "type": "pypi", - "name": "fictional", - "version": "9.10.2", - "extracted_license_statement": ( - "LGPL-3.0-or-later" - " AND " - "LicenseRef-scancode-openssl-exception-lgpl3.0plus" - ), - "declared_license_expression": ( - "lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus" - ), - "homepage_url": "https://home.page", - "bug_tracking_url": "", - "vcs_url": "", - "extra_data": "", - "filename": "package.zip", - }, - } - - for package in packages: - expected = expected_data.get(str(package)) - self.assertEqual(expected["type"], package.type) - self.assertEqual(expected["name"], package.name) - self.assertEqual(expected["version"], package.version) - self.assertEqual(expected["homepage_url"], package.homepage_url) - self.assertEqual( - expected["extracted_license_statement"], - package.extracted_license_statement, - ) - self.assertEqual( - expected["declared_license_expression"], - package.declared_license_expression, - ) - self.assertEqual(expected["filename"], package.filename) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_with_dependencies_integration(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = self.data / "cyclonedx" / "laravel-7.12.0" / "bom.1.4.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(62, project1.discoveredpackages.count()) - self.assertEqual(112, project1.discovereddependencies.count()) - dependency = project1.discovereddependencies.all()[0] - 
self.assertEqual("bom.1.4.json", str(dependency.datafile_resource)) - - def test_scanpipe_load_sbom_pipeline_cyclonedx_with_vulnerabilities(self): - pipeline_name = "load_sbom" - project1 = make_project() - - input_location = ( - self.data / "cyclonedx" / "python-3.13.0-vulnerabilities.cdx.json" - ) - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(1, project1.discoveredpackages.count()) - package = project1.discoveredpackages.get() - expected = [ - { - "vulnerability_id": "CVE-2005-2541", - "summary": "Tar 1.15.1 does not properly warn the user when...", - } - ] - self.assertEqual(expected, package.affected_by_vulnerabilities) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("uuid.uuid4") - def test_scanpipe_deploy_to_develop_pipeline_integration( - self, mock_uuid4, mock_request - ): - forced_uuid = "b74fe5df-e965-415e-ba65-f38421a0695d" - mock_uuid4.return_value = forced_uuid - mock_request.return_value = None - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis", uuid=forced_uuid) - selected_groups = ["Java"] - - jar_location = self.data / "d2d" / "jars" - project1.copy_input_from(jar_location / "from-flume-ng-node-1.9.0.zip") - project1.copy_input_from(jar_location / "to-flume-ng-node-1.9.0.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(57, project1.codebaseresources.count()) - self.assertEqual(18, project1.codebaserelations.count()) - self.assertEqual(1, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "d2d" / "flume-ng-node-d2d.json" - self.assertPipelineResultEqual(expected_file, result_file) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_deploy_to_develop_pipeline_integration_elfs(self): - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis") - selected_groups = ["Elf"] - - elf_location = self.data / "d2d-elfs" - project1.copy_input_from(elf_location / "from-brotli-d2d.zip") - project1.copy_input_from(elf_location / "to-brotli-d2d.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(17, project1.codebaseresources.count()) - self.assertEqual(7, project1.codebaserelations.count()) - - result_file = output.to_json(project1) - expected_file = self.data / "d2d-elfs" / "brotli-elf-d2d.json" - self.assertPipelineResultEqual(expected_file, result_file) - - def test_scanpipe_deploy_to_develop_pipeline_extract_input_files_errors(self): - project1 = make_project() - run = project1.add_pipeline("map_deploy_to_develop") - pipeline_instance = deploy_to_develop.DeployToDevelop(run) - - # Create 2 files in the input/ directory to generate error twice - project1.move_input_from(tempfile.mkstemp(prefix="from-")[1]) - project1.move_input_from(tempfile.mkstemp(prefix="to-")[1]) - self.assertEqual(2, len(project1.input_files)) - - pipeline_instance.get_inputs() - with 
mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: - extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} - inputs_with_codebase_path_destination = [ - (pipeline_instance.from_files, project1.codebase_path / d2d.FROM), - (pipeline_instance.to_files, project1.codebase_path / d2d.TO), - ] - - for input_files, codebase_path in inputs_with_codebase_path_destination: - for input_file_path in input_files: - pipeline_instance.extract_archive(input_file_path, codebase_path) - - projects_errors = project1.projectmessages.all() - self.assertEqual(2, len(projects_errors)) - project_error = projects_errors[0] - self.assertEqual("error", project_error.severity) - self.assertEqual("error1\nerror2", project_error.description) - self.assertEqual("extract_archive", project_error.model) - self.assertEqual({"filename": "resource"}, project_error.details) - self.assertEqual("", project_error.traceback) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("uuid.uuid4") - def test_scanpipe_deploy_to_develop_pipeline_with_about_file( - self, mock_uuid4, mock_request - ): - forced_uuid = "90cb6382-431c-4187-be76-d4f1a2199a2f" - mock_uuid4.return_value = forced_uuid - mock_request.return_value = None - pipeline_name = "map_deploy_to_develop" - project1 = make_project(name="Analysis", uuid=forced_uuid) - selected_groups = ["Java"] - - data_dir = self.data / "d2d" / "about_files" - project1.copy_input_from(data_dir / "from-with-about-file.zip") - project1.copy_input_from(data_dir / "to-with-jar.zip") - - run = project1.add_pipeline( - pipeline_name=pipeline_name, selected_groups=selected_groups - ) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertEqual(44, project1.codebaseresources.count()) - self.assertEqual(31, project1.codebaserelations.count()) - self.assertEqual(2, project1.discoveredpackages.count()) - self.assertEqual(0, project1.discovereddependencies.count()) - - result_file = output.to_json(project1) - expected_file = data_dir / "expected.json" - self.assertPipelineResultEqual(expected_file, result_file) - - self.assertEqual(1, project1.projectmessages.count()) - message = project1.projectmessages.get() - self.assertEqual("map_about_files", message.model) - expected = ( - "Resource paths listed at about_resource is not found in the to/ codebase" - ) - self.assertIn(expected, message.description) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("scanpipe.pipes.purldb.is_available") - def test_scanpipe_populate_purldb_pipeline_integration( - self, mock_is_available, mock_request_post - ): - pipeline_name1 = "load_inventory" - pipeline_name2 = "populate_purldb" - project1 = make_project() - - input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" - project1.copy_input_from(input_location) - - run = project1.add_pipeline(pipeline_name1) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - def mock_request_post_return(url, data, headers, timeout): - payload = json.loads(data) - return { - "queued_packages_count": len(payload["packages"]), - "queued_packages": payload["packages"], - "unqueued_packages_count": 1, - "unqueued_packages": [], - "unsupported_packages_count": 1, - "unsupported_packages": [], - } - - mock_request_post.side_effect = mock_request_post_return - mock_is_available.return_value = True - - run = project1.add_pipeline(pipeline_name2) - 
pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertIn("Populating PurlDB with 2 PURLs from DiscoveredPackage", run.log) - self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log) - self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) - self.assertIn("Couldn't index 1 unsupported PURLs", run.log) - - @mock.patch("scanpipe.pipes.purldb.request_post") - @mock.patch("scanpipe.pipes.purldb.is_available") - def test_scanpipe_populate_purldb_pipeline_integration_without_assembly( - self, mock_is_available, mock_request_post - ): - pipeline_name = "populate_purldb" - project1 = make_project() - - def mock_request_post_return(url, data, headers, timeout): - payload = json.loads(data) - return { - "queued_packages_count": len(payload["packages"]), - "queued_packages": payload["packages"], - "unqueued_packages_count": 1, - "unqueued_packages": [], - "unsupported_packages_count": 1, - "unsupported_packages": [], - } - - mock_request_post.side_effect = mock_request_post_return - mock_is_available.return_value = True - - package_json_location = self.data / "manifests" / "package.json" - copy_input(package_json_location, project1.codebase_path) - pipes.collect_and_create_codebase_resources(project1) - - scancode.scan_for_application_packages(project1, assemble=False) - scancode.process_package_data(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - self.assertIn("Populating PurlDB with 1 PURLs from DiscoveredPackage", run.log) - self.assertIn( - "Populating PurlDB with 6 unresolved PURLs from DiscoveredDependency", - run.log, - ) - self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) - self.assertIn("Couldn't index 1 unsupported PURLs", run.log) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_ctags_pipeline_integration(self): - pipeline_name = "collect_symbols_ctags" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "d2d-javascript" / "from" / "main.js" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data_symbols = main_file.extra_data.get("source_symbols") - expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] - self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) - - @skipIf(sys.platform != "linux", "Only supported on Linux") - def test_scanpipe_collect_strings_gettext_pipeline_integration(self): - pipeline_name = "collect_strings_gettext" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "d2d-javascript" / "from" / "main.js" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data_strings = 
main_file.extra_data.get("source_strings") - expected_extra_data_strings = [ - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=", # noqa - "Enter the desired length of your password:", - ] - self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_pygments_pipeline_integration(self): - pipeline_name = "collect_symbols_pygments" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "source-inspector" / "test3.cpp" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data = main_file.extra_data - - expected_extra_data = ( - self.data / "source-inspector" / "test3.cpp-pygments-expected.json" - ) - - with open(expected_extra_data) as f: - expected_extra_data = json.load(f) - - self.assertDictEqual(expected_extra_data, result_extra_data) - - @skipIf(sys.platform == "darwin", "Not supported on macOS") - def test_scanpipe_collect_symbols_tree_sitter_pipeline_integration(self): - pipeline_name = "collect_symbols_tree_sitter" - project1 = make_project() - - dir = project1.codebase_path / "codefile" - dir.mkdir(parents=True) - - file_location = self.data / "source-inspector" / "test3.cpp" - copy_input(file_location, dir) - - pipes.collect_and_create_codebase_resources(project1) - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - main_file = project1.codebaseresources.files()[0] - result_extra_data = main_file.extra_data - - expected_extra_data = ( - self.data / "source-inspector" / "test3.cpp-tree-sitter-expected.json" - ) - - with open(expected_extra_data) as f: - expected_extra_data = json.load(f) - - self.assertDictEqual(expected_extra_data, result_extra_data) - - @mock.patch("scanpipe.pipes.purldb.is_available") - @mock.patch("scanpipe.pipes.purldb.is_configured") - @mock.patch("scanpipe.pipes.purldb.collect_data_for_purl") - def test_scanpipe_enrich_with_purldb_pipeline_integration( - self, mock_collect_data, mock_is_configured, mock_is_available - ): - pipeline_name = "enrich_with_purldb" - project1 = make_project() - package1 = make_package(project1, package_url="pkg:npm/csvtojson@2.0.10") - - mock_is_configured.return_value = True - mock_is_available.return_value = True - - purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json" - purldb_entry = json.loads(purldb_entry_file.read_text()) - mock_collect_data.return_value = [purldb_entry] - - run = project1.add_pipeline(pipeline_name) - pipeline = run.make_pipeline_instance() - - exitcode, out = pipeline.execute() - self.assertEqual(0, exitcode, msg=out) - - package1.refresh_from_db() - self.assertTrue(package1.extra_data.get("enrich_with_purldb")) - - run.refresh_from_db() - self.assertIn("pkg:npm/csvtojson@2.0.10 ['release_date'", run.log) - self.assertIn("1 discovered package enriched with the PurlDB.", run.log) - - +<<<<<<< HEAD + +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. 
+# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +import io +import json +import os +import sys +import tempfile +from contextlib import redirect_stderr +from pathlib import Path +from unittest import mock +from unittest import skipIf + +from django.conf import settings +from django.test import TestCase +from django.test import tag + +from packageurl import PackageURL +from scancode.cli_test_utils import purl_with_fake_uuid +from scorecode.models import PackageScore + +from scanpipe import pipes +from scanpipe.models import CodebaseResource +from scanpipe.models import DiscoveredPackage +from scanpipe.models import InputSource +from scanpipe.pipelines import CommonStepsMixin +from scanpipe.pipelines import InputFilesError +from scanpipe.pipelines import Pipeline +from scanpipe.pipelines import analyze_root_filesystem +from scanpipe.pipelines import deploy_to_develop +from scanpipe.pipelines import is_pipeline +from scanpipe.pipelines import scan_single_package +from scanpipe.pipes import d2d +from scanpipe.pipes import flag +from scanpipe.pipes import output +from scanpipe.pipes import scancode +from scanpipe.pipes.input import copy_input +from scanpipe.tests import FIXTURES_REGEN +from scanpipe.tests import make_mock_response +from scanpipe.tests import make_package +from scanpipe.tests import make_project +from scanpipe.tests import package_data1 +from scanpipe.tests.pipelines.do_nothing import DoNothing +from scanpipe.tests.pipelines.download_inputs import DownloadInput +from scanpipe.tests.pipelines.profile_step import ProfileStep +from scanpipe.tests.pipelines.steps_as_attribute import StepsAsAttribute +from scanpipe.tests.pipelines.with_groups import WithGroups + +from_docker_image = os.environ.get("FROM_DOCKER_IMAGE") + + +class ScanPipePipelinesTest(TestCase): + data = Path(__file__).parent / "data" + + def test_scanpipe_pipeline_class_pipeline_name_attribute(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline_instance = DoNothing(run) + self.assertEqual("do_nothing", pipeline_instance.pipeline_name) + + def test_scanpipe_pipeline_class_get_info(self): + expected = { + "description": "Description section of the doc string.", + "summary": "Do nothing, in 2 steps.", + "steps": [ + {"name": "step1", "doc": "Step1 doc.", "groups": []}, + {"name": "step2", "doc": "Step2 doc.", "groups": []}, + ], + "available_groups": [], + } + self.assertEqual(expected, DoNothing.get_info()) + + expected = { + "summary": "Profile a step using the @profile decorator.", + "description": "", + "steps": [ 
+ {"name": "step", "doc": "", "groups": []}, + ], + "available_groups": [], + } + self.assertEqual(expected, ProfileStep.get_info()) + + def test_scanpipe_pipeline_class_get_summary(self): + expected = "Do nothing, in 2 steps." + self.assertEqual(expected, DoNothing.get_summary()) + + expected = "Profile a step using the @profile decorator." + self.assertEqual(expected, ProfileStep.get_summary()) + + def test_scanpipe_pipeline_class_log(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + pipeline.log("Event1") + pipeline.log("Event2") + + run.refresh_from_db() + self.assertIn("Event1", run.log) + self.assertIn("Event2", run.log) + + def test_scanpipe_pipeline_class_execute(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode) + self.assertEqual("", out) + + run.refresh_from_db() + self.assertIn("Pipeline [do_nothing] starting", run.log) + self.assertIn("Step [step1] starting", run.log) + self.assertIn("Step [step1] completed", run.log) + self.assertIn("Step [step2] starting", run.log) + self.assertIn("Step [step2] completed", run.log) + self.assertIn("Pipeline completed", run.log) + + def test_scanpipe_pipeline_class_execute_with_exception(self): + project1 = make_project() + run = project1.add_pipeline("raise_exception") + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(1, exitcode) + self.assertTrue(out.startswith("Error message")) + self.assertIn("Traceback:", out) + self.assertIn("in execute", out) + self.assertIn("step(self)", out) + self.assertIn("in raise_exception", out) + self.assertIn("raise ValueError", out) + + run.refresh_from_db() + self.assertIn("Pipeline [raise_exception] starting", run.log) + self.assertIn("Step [raise_exception_step] starting", run.log) + self.assertIn("Pipeline failed", run.log) + + @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step1") + @mock.patch("scanpipe.tests.pipelines.do_nothing.DoNothing.step2") + def test_scanpipe_pipeline_class_execute_with_selected_steps(self, step2, step1): + step1.__name__ = "step1" + step1.groups = [] + step2.__name__ = "step2" + step2.groups = [] + + project1 = make_project() + run = project1.add_pipeline("do_nothing") + run.update(selected_steps=["step2", "not_existing_step"]) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode) + self.assertEqual("", out) + + step1.assert_not_called() + step2.assert_called() + + run.refresh_from_db() + self.assertIn("Pipeline [do_nothing] starting", run.log) + self.assertIn("Step [step1] skipped", run.log) + self.assertIn("Step [step2] starting", run.log) + self.assertIn("Step [step2] completed", run.log) + self.assertIn("Pipeline completed", run.log) + + def test_scanpipe_pipeline_class_download_inputs_attribute(self): + project1 = make_project() + run = project1.add_pipeline("download_inputs") + pipeline = run.make_pipeline_instance() + self.assertTrue(pipeline.download_inputs) + expected = (CommonStepsMixin.download_missing_inputs,) + self.assertEqual(expected, pipeline.get_initial_steps()) + expected = (CommonStepsMixin.download_missing_inputs, DownloadInput.step1) + self.assertEqual(expected, pipeline.get_steps()) + pipeline.execute() + self.assertIn("Step [download_missing_inputs]", run.log) + + run = project1.add_pipeline("profile_step") + pipeline = 
run.make_pipeline_instance() + self.assertFalse(pipeline.download_inputs) + pipeline.execute() + self.assertNotIn("Step [download_missing_inputs]", run.log) + + @mock.patch("requests.sessions.Session.get") + def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + file_location = self.data / "aboutcode" / "notice.NOTICE" + input_source = project1.add_input_source( + filename=file_location.name, is_uploaded=True + ) + self.assertFalse(input_source.exists()) + with self.assertRaises(InputFilesError) as error: + pipeline.download_missing_inputs() + error_msg = ( + "InputFilesError encountered with the following issues:\n\n" + "Error 1: Uploaded file filename=notice.NOTICE [uploaded] not available." + "\n\nNo traceback available." + ) + self.assertEqual(error_msg, str(error.exception)) + self.assertIn( + "Uploaded file filename=notice.NOTICE [uploaded] not available.", run.log + ) + + project1.copy_input_from(file_location) + self.assertTrue(input_source.exists()) + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + pipeline.download_missing_inputs() + self.assertEqual("", run.log) + + download_url = "https://download.url/file.zip" + mock_get.return_value = make_mock_response(url=download_url) + input_source2 = project1.add_input_source(download_url=download_url) + pipeline.download_missing_inputs() + self.assertIn("Fetching input from https://download.url/file.zip", run.log) + input_source2.refresh_from_db() + self.assertEqual("file.zip", input_source2.filename) + self.assertTrue(input_source2.exists()) + mock_get.assert_called_once() + + @mock.patch("scanpipe.models.InputSource.fetch") + def test_scanpipe_pipeline_class_download_fetch_exception(self, mock_fetch): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + mock_fetch.side_effect = Exception("File not found") + download_url = "https://download.url/file.zip" + project1.add_input_source(download_url=download_url) + + with self.assertRaises(InputFilesError) as error: + pipeline.download_missing_inputs() + self.assertIn( + "InputFilesError encountered with the following issues:", + str(error.exception), + ) + self.assertIn("Error 1: File not found", str(error.exception)) + self.assertIn("Traceback (most recent call last):", str(error.exception)) + self.assertIn("Exception: File not found", str(error.exception)) + + self.assertIn("Fetching input from https://download.url/file.zip", run.log) + self.assertIn("https://download.url/file.zip could not be fetched.", run.log) + + @mock.patch("git.repo.base.Repo.clone_from") + def test_scanpipe_pipeline_class_download_missing_inputs_git_repo(self, mock_clone): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + download_url = "https://github.com/aboutcode-org/scancode.io.git" + input_source = project1.add_input_source(download_url=download_url) + + def mock_make_to_path(**kwargs): + to_path = kwargs.get("to_path") + to_path.mkdir() + + mock_clone.side_effect = mock_make_to_path + mock_clone.return_value = None + + pipeline.download_missing_inputs() + self.assertIn( + "Fetching input from https://github.com/aboutcode-org/scancode.io.git", + run.log, + ) + input_source.refresh_from_db() + self.assertEqual("scancode.io.git", input_source.filename) + 
self.assertTrue(input_source.exists())
+
+    @mock.patch("requests.get")
+    def test_archive_downloads(self, mock_get):
+        project1 = make_project()
+        run = project1.add_pipeline("scan_codebase")
+        pipeline = run.make_pipeline_instance()
+        test_filename = "sample.tar.gz"
+        test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        test_data_path = (
+            Path(__file__).parent / "data" / "test-downloads" / test_filename
+        )
+        with open(test_data_path, "rb") as f:
+            test_content = f.read()
+
+        input_source = InputSource.objects.create(
+            project=project1,
+            filename=test_filename,
+            download_url=test_url,
+            is_uploaded=False,
+        )
+
+        mock_get.return_value.content = test_content
+        mock_get.return_value.status_code = 200
+
+        # The fetched file is expected to land under the shared central
+        # archive location rather than in the project-specific workspace.
+        pipeline.download_missing_inputs()
+        input_source.refresh_from_db()
+        self.assertTrue(
+            input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
+
+        # Archiving records provenance metadata (checksum, download date)
+        # on the input source.
+        pipeline.archive_downloads()
+        input_source.refresh_from_db()
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertEqual(input_source.filename, test_filename)
+
+        # A second project fetching the same URL should reuse the same
+        # archived file path instead of storing a duplicate copy.
+        project2 = make_project(name="project2")
+        input_source2 = InputSource.objects.create(
+            project=project2,
+            filename=test_filename,
+            download_url=test_url,
+            is_uploaded=False,
+        )
+        run2 = project2.add_pipeline("scan_codebase")
+        pipeline2 = run2.make_pipeline_instance()
+        pipeline2.download_missing_inputs()
+        input_source2.refresh_from_db()
+        self.assertEqual(input_source.file_path, input_source2.file_path)
+        self.assertTrue(Path(input_source2.file_path).exists())
+
+    def test_scanpipe_pipeline_class_save_errors_context_manager(self):
+        project1 = make_project()
+        run = project1.add_pipeline("do_nothing")
+        pipeline = run.make_pipeline_instance()
+        self.assertEqual(project1, pipeline.project)
+
+        with pipeline.save_errors(Exception):
+            raise Exception("Error message")
+
+        message = project1.projectmessages.get()
+        self.assertEqual("do_nothing", message.model)
+        self.assertEqual({}, message.details)
+        self.assertEqual("Error message", message.description)
+        self.assertIn('raise Exception("Error message")', message.traceback)
+
+        resource1 = CodebaseResource.objects.create(project=project1, path="filename")
+        with pipeline.save_errors(Exception, resource=resource1):
+            raise Exception("Error message")
+        message = project1.projectmessages.latest("created_date")
+        self.assertEqual({"resource_path": str(resource1.path)}, message.details)
+
+    def test_scanpipe_pipelines_is_pipeline(self):
+        self.assertFalse(is_pipeline(None))
+        self.assertFalse(is_pipeline(Pipeline))
+        self.assertTrue(is_pipeline(DoNothing))
+
+        class SubSubClass(DoNothing):
+            pass
+
+        self.assertTrue(is_pipeline(SubSubClass))
+
+    def test_scanpipe_pipeline_class_get_graph(self):
+        expected = [
+            {"name": "step1", "doc": "Step1 doc.", "groups": []},
+            {"name": "step2", "doc": "Step2 doc.", "groups": []},
+        ]
+        self.assertEqual(expected, DoNothing.get_graph())
+
+    def test_scanpipe_pipelines_profile_decorator(self):
+        project1 = make_project()
+        run = project1.add_pipeline("profile_step")
+        pipeline_instance = run.make_pipeline_instance()
+
+        exitcode, out = pipeline_instance.execute()
+        self.assertEqual(0, exitcode)
+
+        run.refresh_from_db()
+        self.assertIn("Profiling results at", run.log)
+        self.assertIn("Pipeline completed", run.log)
+
+        self.assertEqual(1, len(project1.output_root))
+        output_file = 
project1.output_root[0] + self.assertTrue(output_file.startswith("profile-")) + self.assertTrue(output_file.endswith(".html")) + + def test_scanpipe_pipeline_class_get_steps(self): + expected = ( + DoNothing.step1, + DoNothing.step2, + ) + self.assertEqual(expected, DoNothing.get_steps()) + + with self.assertRaises(TypeError) as cm: + StepsAsAttribute.get_steps() + expected = "Use a ``steps(cls)`` classmethod to declare the steps." + self.assertEqual(expected, str(cm.exception)) + + def test_scanpipe_pipeline_class_get_steps_with_groups(self): + expected = (WithGroups.no_groups,) + self.assertEqual(expected, WithGroups.get_steps()) + self.assertEqual(expected, WithGroups.get_steps(groups=[])) + self.assertEqual(expected, WithGroups.get_steps(groups=["not_defined"])) + + expected = ( + WithGroups.grouped_with_foo_and_bar, + WithGroups.grouped_with_bar, + WithGroups.no_groups, + ) + self.assertEqual(expected, WithGroups.get_steps(groups=["bar"])) + self.assertEqual(expected, WithGroups.get_steps(groups=["foo", "bar"])) + + expected = ( + WithGroups.grouped_with_foo_and_bar, + WithGroups.no_groups, + ) + self.assertEqual(expected, WithGroups.get_steps(groups=["foo"])) + + def test_scanpipe_pipeline_class_get_available_groups(self): + self.assertEqual(["bar", "excluded", "foo"], WithGroups.get_available_groups()) + self.assertEqual([], DoNothing.get_available_groups()) + + def test_scanpipe_pipeline_class_env_loaded_from_config_file(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + self.assertEqual({}, pipeline.env) + + config_file = project1.input_path / settings.SCANCODEIO_CONFIG_FILE + config_file.write_text("{*this is not valid yml*}") + pipeline = run.make_pipeline_instance() + self.assertEqual({}, pipeline.env) + + config_file.write_text("product_name: Product") + pipeline = run.make_pipeline_instance() + self.assertEqual({"product_name": "Product"}, pipeline.env) + + def test_scanpipe_pipeline_class_env_reloaded_after_extraction(self): + project1 = make_project() + + input_location = self.data / "settings" / "archived-scancode-config.zip" + project1.copy_input_from(input_location) + run = project1.add_pipeline("scan_codebase") + pipeline = run.make_pipeline_instance() + self.assertEqual({}, pipeline.env) + + # Manually run steps, env is reload from the scancode-config.yml contained in + # the archive + pipeline.copy_inputs_to_codebase_directory() + pipeline.extract_archives() + + expected = { + "product_name": "My Product Name", + "product_version": "1.0", + "ignored_patterns": ["*.tmp", "tests/*"], + } + self.assertEqual(expected, pipeline.env) + + def test_scanpipe_pipeline_class_flag_ignored_resources(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + self.assertIsNone(pipeline.env.get("ignored_patterns")) + + project1.settings.update({"ignored_patterns": "*.ext"}) + project1.save() + pipeline = run.make_pipeline_instance() + + with mock.patch("scanpipe.pipes.flag.flag_ignored_patterns") as mock_flag: + mock_flag.return_value = None + pipeline.flag_ignored_resources() + + mock_flag.assert_called_once() + patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS] + self.assertEqual(mock_flag.mock_calls[0].kwargs["patterns"], patterns_args) + self.assertEqual(mock_flag.mock_calls[0].kwargs["codebaseresources"].count(), 0) + + def test_scanpipe_pipeline_class_extract_archive(self): + project1 = make_project() + run = 
project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + target = tempfile.mkdtemp() + input_location = str(self.data / "scancode" / "corrupted.tar.gz") + pipeline.extract_archive(input_location, target) + + projects_errors = project1.projectmessages.all() + self.assertEqual(1, len(projects_errors)) + project_error = projects_errors.get() + self.assertEqual("error", project_error.severity) + self.assertIn("gzip decompression failed", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "corrupted.tar.gz"}, project_error.details) + self.assertEqual("", project_error.traceback) + + def test_scanpipe_pipeline_class_extract_archives(self): + project1 = make_project() + run = project1.add_pipeline("do_nothing") + pipeline = run.make_pipeline_instance() + + input_location = str(self.data / "scancode" / "corrupted.tar.gz") + resource_location = copy_input(input_location, project1.codebase_path) + pipeline.extract_archives() + + projects_errors = project1.projectmessages.all() + self.assertEqual(1, len(projects_errors)) + project_error = projects_errors.get() + self.assertEqual("error", project_error.severity) + self.assertIn("gzip decompression failed", project_error.description) + self.assertEqual("extract_archives", project_error.model) + self.assertEqual( + {"resource_path": str(resource_location)}, project_error.details + ) + self.assertEqual("", project_error.traceback) + + +class RootFSPipelineTest(TestCase): + def test_scanpipe_rootfs_pipeline_extract_input_files_errors(self): + project1 = make_project() + run = project1.add_pipeline("analyze_root_filesystem_or_vm_image") + pipeline_instance = analyze_root_filesystem.RootFS(run) + + # Create 2 files in the input/ directory to generate error twice + project1.move_input_from(tempfile.mkstemp()[1]) + project1.move_input_from(tempfile.mkstemp()[1]) + self.assertEqual(2, len(project1.input_files)) + + with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: + extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} + pipeline_instance.extract_input_files_to_codebase_directory() + + projects_errors = project1.projectmessages.all() + self.assertEqual(2, len(projects_errors)) + project_error = projects_errors[0] + self.assertEqual("error", project_error.severity) + self.assertEqual("error1\nerror2", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "resource"}, project_error.details) + self.assertEqual("", project_error.traceback) + + +def sort_for_os_compatibility(scan_data): + """Sort the ``scan_data`` files and relations in place. 
Return ``scan_data``."""
+    if files := scan_data.get("files"):
+        files.sort(key=lambda x: x["path"])
+
+    if relations := scan_data.get("relations"):
+        relations.sort(key=lambda x: x["to_resource"])
+
+    return scan_data
+
+
+@tag("slow")
+class PipelinesIntegrationTest(TestCase):
+    """Integration tests to ensure the proper output for each built-in pipeline."""
+
+    # Uncomment the following to display full diffs:
+    # maxDiff = None
+    data = Path(__file__).parent / "data"
+    exclude_from_diff = [
+        "start_timestamp",
+        "end_timestamp",
+        "date",
+        "duration",
+        "input",
+        "compliance_alert",
+        "policy",
+        "tool_version",
+        "other_tools",
+        "created_date",
+        "log",
+        "uuid",
+        "size",  # directory sizes are OS dependent
+        "size_count",
+        "--json-pp",
+        "--processes",
+        "--verbose",
+        # system_environment differs between systems
+        "system_environment",
+        "file_type",
+        # mime type and is_script are inconsistent across systems
+        "mime_type",
+        "is_script",
+        "notes",
+        "settings",
+        "description",
+        "traceback",
+    ]
+
+    def _without_keys(self, data, exclude_keys):
+        """Return the `data` excluding the provided `exclude_keys`."""
+        if isinstance(data, list):
+            return [self._without_keys(entry, exclude_keys) for entry in data]
+
+        if isinstance(data, dict):
+            return {
+                key: (
+                    self._without_keys(value, exclude_keys)
+                    if type(value) in [list, dict]
+                    else value
+                )
+                for key, value in data.items()
+                if key not in exclude_keys
+            }
+
+        return data
+
+    def purl_fields_with_fake_uuid(self, value, key):
+        purl_fields = ["purl", "for_packages", "package_uid"]
+        purl_name = "fixed-name-for-testing-5642512d1758"
+        purl_namespace = "fixed-namespace-for-testing-5642512d1758"
+
+        if key == "name":
+            return purl_name
+        elif key == "namespace":
+            return purl_namespace
+        elif key in purl_fields:
+            purl_old = PackageURL.from_string(value)
+            if purl_old.type != "local-files":
+                return purl_with_fake_uuid(value)
+
+            purl = PackageURL(
+                name=purl_name,
+                namespace=purl_namespace,
+                type="local-files",
+                version=purl_old.version,
+                qualifiers=purl_old.qualifiers,
+                subpath=purl_old.subpath,
+            )
+            return purl_with_fake_uuid(purl.to_string())
+
+    def _normalize_package_uids(self, data):
+        """
+        Return the `data`, where any `package_uid` value has been normalized
+        with `purl_with_fake_uuid()`.
+        """
+        fields_with_package_uids = [
+            "package_uid",
+            "dependency_uid",
+            "for_package_uid",
+            "resolved_to_package_uid",
+        ]
+        if isinstance(data, list):
+            return [self._normalize_package_uids(entry) for entry in data]
+
+        if isinstance(data, dict):
+            is_local_files = False
+            if data.get("type") and data["type"] == "local-files":
+                is_local_files = True
+            normalized_data = {}
+            for key, value in data.items():
+                if isinstance(value, list | dict):
+                    value = self._normalize_package_uids(value)
+                if key in fields_with_package_uids and value:
+                    value = purl_with_fake_uuid(value)
+                if key == "for_packages" and value:
+                    value = sorted(
+                        [
+                            self.purl_fields_with_fake_uuid(package_uid, key)
+                            for package_uid in value
+                        ]
+                    )
+                if (
+                    is_local_files
+                    and key in ("name", "namespace", "purl", "package_uid")
+                    and value
+                ):
+                    value = self.purl_fields_with_fake_uuid(value, key)
+                normalized_data[key] = value
+            return normalized_data
+
+        return data
+
+    def _sort_dependencies(self, data):
+        """
+        Sort dependencies by their "for_package_uid".
+
+        After dependency resolution, several dependency requirements can
+        resolve to the same package, and their relative order in the
+        output is not deterministic across runs.
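+
+        For example (illustrative), the requirements "django" and
+        "django>=4.0" may both resolve to the same pkg:pypi/django package
+        instance; keying the mappings by "for_package_uid" gives the
+        resulting list a stable, reproducible order.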
+ """ + mappings = data.get("dependencies") + if mappings: + mappings_by_uid = {} + for mapping in mappings: + uid = mapping.get("for_package_uid") or "" + mappings_by_uid[uid] = mapping + data["dependencies"] = list(dict(sorted(mappings_by_uid.items())).values()) + return data + + def test_package_uids_normalized_in_pipeline_integration_tests(self): + self.maxDiff = 1000 + data = { + "type": "local-files", + "package_uid": ( + "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23" + "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24" + ), + "for_packages": [ + ( + "pkg:local-files/analysis-438ebaf4/42440f35-1091-4c03-8c96-a0ed3d3caf23" + "?uuid=42440f35-1091-4c03-8c96-a0ed3d3caf24" + ) + ], + } + normalized_data = self._normalize_package_uids(data=data) + expected_data = { + "type": "local-files", + "package_uid": ( + "pkg:local-files/fixed-namespace-for-testing-5642512d1758/" + "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758" + ), + "for_packages": [ + ( + "pkg:local-files/fixed-namespace-for-testing-5642512d1758/" + "fixed-name-for-testing-5642512d1758?uuid=fixed-uid-done-for-testing-5642512d1758" + ) + ], + } + self.assertEqual(normalized_data, expected_data) + + def assertPipelineResultEqual( + self, expected_file, result_file, sort_dependencies=False, regen=FIXTURES_REGEN + ): + """Set `regen` to True to regenerate the expected results.""" + result_json = json.loads(Path(result_file).read_text()) + result_json = self._normalize_package_uids(result_json) + result_data = self._without_keys(result_json, self.exclude_from_diff) + if sort_dependencies: + result_data = self._sort_dependencies(result_data) + result_data = sort_for_os_compatibility(result_data) + + if regen: + expected_file.write_text(json.dumps(result_data, indent=2)) + + expected_json = json.loads(expected_file.read_text()) + expected_json = self._normalize_package_uids(expected_json) + expected_data = self._without_keys(expected_json, self.exclude_from_diff) + if sort_dependencies: + result_data = self._sort_dependencies(result_data) + expected_data = sort_for_os_compatibility(expected_data) + + self.assertEqual(expected_data, result_data) + + @skipIf(from_docker_image, "Random failure in the Docker context.") + def test_scanpipe_scan_package_pipeline_integration(self): + pipeline_name = "scan_single_package" + project1 = make_project() + + input_location = self.data / "scancode" / "is-npm-1.0.0.tgz" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(4, project1.codebaseresources.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(1, project1.discovereddependencies.count()) + + scancode_file = project1.get_latest_output(filename="scancode") + expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_package.json" + self.assertPipelineResultEqual(expected_file, scancode_file) + + summary_file = project1.get_latest_output(filename="summary") + expected_file = ( + self.data / "scancode" / "is-npm-1.0.0_scan_package_summary.json" + ) + self.assertPipelineResultEqual(expected_file, summary_file) + + # Ensure that we only have one instance of is-npm in `key_files_packages` + summary_data = json.loads(Path(summary_file).read_text()) + key_files_packages = summary_data.get("key_files_packages", []) + self.assertEqual(1, len(key_files_packages)) + key_file_package = 
key_files_packages[0] + key_file_package_purl = key_file_package.get("purl", "") + self.assertEqual("pkg:npm/is-npm@1.0.0", key_file_package_purl) + + @skipIf(from_docker_image, "Random failure in the Docker context.") + def test_scanpipe_scan_package_pipeline_integration_multiple_packages(self): + pipeline_name = "scan_single_package" + project1 = make_project() + + input_location = self.data / "scancode" / "multiple-is-npm-1.0.0.tar.gz" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(9, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(2, project1.discovereddependencies.count()) + + scancode_file = project1.get_latest_output(filename="scancode") + expected_file = ( + self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package.json" + ) + # Do not override the regen as this file is generated in regen_test_data + self.assertPipelineResultEqual(expected_file, scancode_file) + + summary_file = project1.get_latest_output(filename="summary") + expected_file = ( + self.data / "scancode" / "multiple-is-npm-1.0.0_scan_package_summary.json" + ) + self.assertPipelineResultEqual(expected_file, summary_file) + + @mock.patch("scanpipe.pipelines.scan_single_package.is_archive") + def test_scanpipe_scan_package_single_extract_input_to_codebase_directory( + self, mock_is_archive + ): + project1 = make_project() + run = project1.add_pipeline("scan_single_package") + pipeline_instance = scan_single_package.ScanSinglePackage(run) + + project1.move_input_from(tempfile.mkstemp(suffix=".zip")[1]) + self.assertEqual(1, len(project1.input_files)) + + mock_is_archive.return_value = True + pipeline_instance.get_package_input() + with mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: + extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} + pipeline_instance.extract_input_to_codebase_directory() + + projects_errors = project1.projectmessages.all() + self.assertEqual(1, len(projects_errors)) + project_error = projects_errors[0] + self.assertEqual("error", project_error.severity) + self.assertEqual("error1\nerror2", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "resource"}, project_error.details) + self.assertEqual("", project_error.traceback) + + def test_scanpipe_scan_package_single_file(self): + pipeline_name = "scan_single_package" + project1 = make_project() + + input_location = self.data / "manifests" / "openpdf-parent-1.3.11.pom.xml" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.codebaseresources.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(10, project1.discovereddependencies.count()) + + scancode_file = project1.get_latest_output(filename="scancode") + expected_file = ( + self.data / "manifests" / "openpdf-parent-1.3.11_scan_package.json" + ) + self.assertPipelineResultEqual(expected_file, scancode_file) + + @mock.patch("git.repo.base.Repo.clone_from") + def test_scanpipe_scan_package_single_package_git_repo(self, mock_clone): + pipeline_name = "scan_single_package" + project1 = make_project() + + download_url = 
"https://github.com/aboutcode-org/scancode.io.git" + project1.add_input_source(download_url=download_url) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + # Create the "fetched" git directory content + def mock_make_git_directory(**kwargs): + to_path = kwargs.get("to_path") # scancode.io.git + to_path.mkdir() + file_location = self.data / "aboutcode" / "notice.NOTICE" + copy_input(file_location, to_path) + + mock_clone.side_effect = mock_make_git_directory + mock_clone.return_value = None + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(2, project1.codebaseresources.count()) + self.assertEqual(0, project1.discoveredpackages.count()) + + def test_scanpipe_scan_codebase_pipeline_integration(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(6, project1.codebaseresources.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(1, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "scancode" / "is-npm-1.0.0_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_scan_codebase_creates_top_level_paths(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] + + top_level_resources = project1.codebaseresources.filter(parent_path="") + top_level_paths = [resource.path for resource in top_level_resources] + + self.assertListEqual(top_level_paths, expected_top_level_paths) + + def test_scanpipe_scan_codebase_creates_parent_path_field(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + expected_top_level_paths = ["is-npm-1.0.0.tgz", "is-npm-1.0.0.tgz-extract"] + expected_nested_paths = [ + "is-npm-1.0.0.tgz-extract/package/index.js", + "is-npm-1.0.0.tgz-extract/package/package.json", + "is-npm-1.0.0.tgz-extract/package/readme.md", + ] + + top_level_resources = project1.codebaseresources.filter(parent_path="") + top_level_paths = [resource.path for resource in top_level_resources] + + self.assertListEqual(top_level_paths, expected_top_level_paths) + + nested_resources = project1.codebaseresources.filter( + parent_path="is-npm-1.0.0.tgz-extract/package" + ) + nested_paths = [resource.path for resource in nested_resources] + + self.assertListEqual(nested_paths, expected_nested_paths) + + def test_scanpipe_inspect_packages_creates_packages_npm(self): + pipeline_name = 
"inspect_packages" + project1 = make_project() + + filename = "is-npm-1.0.0.tgz" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(6, project1.codebaseresources.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(1, project1.discovereddependencies.count()) + + package = project1.discoveredpackages.get() + dependency = project1.discovereddependencies.get() + + self.assertEqual(3, package.codebase_resources.count()) + self.assertEqual("pkg:npm/is-npm@1.0.0", dependency.for_package.purl) + self.assertEqual(package.datasource_ids, [dependency.datasource_id]) + self.assertEqual( + package.codebase_resources.get( + path="is-npm-1.0.0.tgz-extract/package/package.json" + ).path, + dependency.datafile_resource.path, + ) + + def test_scanpipe_inspect_packages_creates_packages_pypi(self): + pipeline_name = "inspect_packages" + project1 = make_project() + + input_location = self.data / "manifests" / "python-inspector-0.10.0.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(6, project1.codebaseresources.count()) + self.assertEqual(0, project1.discoveredpackages.count()) + self.assertEqual(26, project1.discovereddependencies.count()) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_inspect_packages_with_resolved_dependencies_npm(self): + pipeline_name = "inspect_packages" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_npm.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(4, project1.codebaseresources.count()) + self.assertEqual(7, project1.discoveredpackages.count()) + self.assertEqual(6, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data + / "dependencies" + / "resolved_dependencies_npm_inspect_packages.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_inspect_packages_with_resolved_dependencies_poetry(self): + pipeline_name = "inspect_packages" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_poetry.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(5, project1.codebaseresources.count()) + self.assertEqual(6, project1.discoveredpackages.count()) + self.assertEqual(10, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data + / "dependencies" + / "resolved_dependencies_poetry_inspect_packages.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform == "darwin", "Not 
supported on macOS") + def test_scanpipe_resolved_dependencies_cocoapods(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + + input_location = ( + self.data / "dependencies" / "resolved_dependencies_cocoapods.zip" + ) + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(3, project1.codebaseresources.count()) + self.assertEqual(25, project1.discoveredpackages.count()) + self.assertEqual(30, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "dependencies" / "resolved_dependencies_cocoapods.json" + ) + self.assertPipelineResultEqual( + expected_file, result_file, sort_dependencies=True + ) + + def test_scanpipe_resolved_dependencies_pip_inspect(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_pip.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(3, project1.codebaseresources.count()) + self.assertEqual(4, project1.discoveredpackages.count()) + self.assertEqual(17, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "dependencies" / "resolved_dependencies_pip.json" + self.assertPipelineResultEqual( + expected_file, + result_file, + ) + + def test_scanpipe_resolved_dependencies_nuget(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + + input_location = self.data / "dependencies" / "resolved_dependencies_nuget.zip" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, + selected_groups=["StaticResolver"], + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(3, project1.codebaseresources.count()) + self.assertEqual(34, project1.discoveredpackages.count()) + self.assertEqual(108, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "dependencies" / "resolved_dependencies_nuget.json" + self.assertPipelineResultEqual( + expected_file, + result_file, + sort_dependencies=True, + ) + + def test_scanpipe_scan_codebase_can_process_wheel(self): + pipeline_name = "scan_codebase" + project1 = make_project() + + filename = "daglib-0.6.0-py3-none-any.whl" + input_location = self.data / "scancode" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(11, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(8, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "scancode" / "daglib-0.6.0-py3-none-any.whl_scan_codebase.json" + ) + self.assertPipelineResultEqual(expected_file, 
result_file) + + @skipIf(sys.platform != "linux", "Expected results are inconsistent across OS") + def test_scanpipe_docker_pipeline_alpine_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "alpine_3_15_4.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(510, project1.codebaseresources.count()) + self.assertEqual(14, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "docker" / "alpine_3_15_4_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_docker_pipeline_does_not_report_errors_for_broken_symlinks(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "minitag.tar" + input_location = self.data / "image-with-symlinks" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + with redirect_stderr(io.StringIO()): + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + project_messages = project1.projectmessages.all() + self.assertEqual(1, len(project_messages)) + self.assertEqual("Distro not found.", project_messages[0].description) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "image-with-symlinks" / (filename + "-expected-scan.json") + ) + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform != "linux", "RPM related features only supported on Linux.") + def test_scanpipe_docker_pipeline_rpm_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "centos.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(29, project1.codebaseresources.count()) + self.assertEqual(101, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "docker" / "centos_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_docker_pipeline_debian_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "debian.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(16, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = 
self.data / "docker" / "debian_scan_codebase.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_docker_pipeline_distroless_debian_integration(self): + pipeline_name = "analyze_docker_image" + project1 = make_project() + + filename = "gcr_io_distroless_base.tar.gz" + input_location = self.data / "docker" / filename + project1.copy_input_from(input_location) + project1.add_input_source("https://download.url", filename) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(2458, project1.codebaseresources.count()) + self.assertEqual(6, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "docker" / "gcr_io_distroless_base_scan_codebase.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_rootfs_pipeline_integration(self): + pipeline_name = "analyze_root_filesystem_or_vm_image" + project1 = make_project() + + input_location = self.data / "rootfs" / "basic-rootfs.tar.gz" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(17, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "rootfs" / "basic-rootfs_root_filesystems.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_load_inventory_pipeline_integration(self): + pipeline_name = "load_inventory" + project1 = make_project() + + input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(18, project1.codebaseresources.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(4, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = ( + self.data / "asgiref" / "asgiref-3.3.0_load_inventory_expected.json" + ) + self.assertPipelineResultEqual(expected_file, result_file) + + # Using the ScanCode.io JSON output as the input + project2 = make_project() + + input_location = self.data / "asgiref" / "asgiref-3.3.0_scanpipe_output.json" + project2.copy_input_from(input_location) + + run = project2.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(18, project2.codebaseresources.count()) + self.assertEqual(2, project2.discoveredpackages.count()) + self.assertEqual(4, project2.discovereddependencies.count()) + + @mock.patch("scanpipe.pipes.vulnerablecode.is_available") + @mock.patch("scanpipe.pipes.vulnerablecode.is_configured") + @mock.patch("scanpipe.pipes.vulnerablecode.bulk_search_by_purl") + def test_scanpipe_find_vulnerabilities_pipeline_integration( + self, mock_bulk_search_by_purl, mock_is_configured, mock_is_available + ): + pipeline_name = "find_vulnerabilities" + project1 = make_project() + 
package1 = DiscoveredPackage.create_from_data(project1, package_data1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_configured.return_value = False + mock_is_available.return_value = False + exitcode, out = pipeline.execute() + self.assertEqual(1, exitcode, msg=out) + self.assertIn("VulnerableCode is not configured.", out) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_configured.return_value = True + mock_is_available.return_value = True + vulnerability_data = [ + { + "purl": "pkg:deb/debian/adduser@3.118?arch=all", + "affected_by_vulnerabilities": [ + { + "vulnerability_id": "VCID-cah8-awtr-aaad", + "summary": "An issue was discovered.", + }, + ], + }, + { + "purl": "pkg:deb/debian/adduser@3.118?qualifiers=1", + "affected_by_vulnerabilities": [ + { + "vulnerability_id": "VCID-cah8-awtr-aaad", + "summary": "An issue was discovered.", + }, + ], + }, + ] + mock_bulk_search_by_purl.return_value = vulnerability_data + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + package1.refresh_from_db() + expected = vulnerability_data[0]["affected_by_vulnerabilities"] + self.assertEqual(expected, package1.affected_by_vulnerabilities) + + @mock.patch("scorecode.ossf_scorecard.is_available") + def test_scanpipe_fetch_scores_pipeline_integration(self, mock_is_available): + pipeline_name = "fetch_scores" + project1 = make_project() + package1 = DiscoveredPackage.create_from_data(project1, package_data1) + package1.vcs_url = "https://github.com/ossf/scorecard" + package1.save() + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_available.return_value = False + exitcode, out = pipeline.execute() + self.assertEqual(1, exitcode, msg=out) + self.assertIn("ScoreCode service is not available.", out) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + mock_is_available.return_value = True + + package_score_data = { + "scoring_tool": "ossf_scorecard", + "scoring_tool_version": "v5.2.1", + "score": "9.7", + "scoring_tool_documentation_url": "https://github.com/[trunc...]", + "score_date": "2025-07-24T18:50:16Z", + } + with mock.patch("scorecode.ossf_scorecard.fetch_scorecard") as fetch: + fetch.return_value = PackageScore(**package_score_data) + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + package1.refresh_from_db() + scorecard_entry = package1.scores.filter(scoring_tool="ossf-scorecard").first() + self.assertIsNotNone(scorecard_entry) + self.assertEqual("ossf-scorecard", scorecard_entry.scoring_tool) + self.assertEqual("v5.2.1", scorecard_entry.scoring_tool_version) + self.assertTrue(scorecard_entry.score) + + def test_scanpipe_resolve_dependencies_pipeline_integration(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + project1.move_input_from(tempfile.mkstemp()[1]) + pipeline.execute() + self.assertEqual(1, project1.projectmessages.count()) + message = project1.projectmessages.get() + self.assertEqual("get_packages_from_manifest", message.model) + expected = "No resources containing package data found in codebase." 
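+        # The recorded message description can include more detail than this
+        # sentence, so the assertion below checks for a substring only.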
+ self.assertIn(expected, message.description) + + def test_scanpipe_resolve_dependencies_pipeline_integration_empty_manifest(self): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) + pipeline.execute() + self.assertEqual(1, project1.projectmessages.count()) + message = project1.projectmessages.get() + self.assertEqual("get_packages_from_manifest", message.model) + expected = "No packages could be resolved" + self.assertIn(expected, message.description) + + @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") + def test_scanpipe_resolve_dependencies_pipeline_integration_misc( + self, mock_resolve_dependencies + ): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + input_location = self.data / "manifests" / "requirements.txt" + project1.copy_input_from(input_location) + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + self.assertEqual(1, project1.discoveredpackages.count()) + + @mock.patch("scanpipe.pipes.resolve.python_inspector.resolve_dependencies") + def test_scanpipe_resolve_dependencies_pipeline_pypi_integration( + self, mock_resolve_dependencies + ): + pipeline_name = "resolve_dependencies" + project1 = make_project() + selected_groups = ["DynamicResolver"] + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + project1.move_input_from(tempfile.mkstemp(suffix="requirements.txt")[1]) + mock_resolve_dependencies.return_value = mock.Mock(packages=[package_data1]) + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + discoveredpackage = project1.discoveredpackages.get() + exclude_fields = ["qualifiers", "release_date", "size"] + for field_name, value in package_data1.items(): + if value and field_name not in exclude_fields: + self.assertEqual(value, getattr(discoveredpackage, field_name)) + + def test_scanpipe_load_sbom_pipeline_aboutfile_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "manifests" / "Django-4.0.8-py3-none-any.whl.ABOUT" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + discoveredpackage = project1.discoveredpackages.get() + self.assertEqual("pypi", discoveredpackage.type) + self.assertEqual("django", discoveredpackage.name) + self.assertEqual("4.0.8", discoveredpackage.version) + self.assertEqual("bsd-new", discoveredpackage.declared_license_expression) + + def test_scanpipe_load_sbom_pipeline_spdx_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "manifests" / "toml.spdx.json" + project1.copy_input_from(input_location) + + run = 
project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + discoveredpackage = project1.discoveredpackages.get() + self.assertEqual("pypi", discoveredpackage.type) + self.assertEqual("toml", discoveredpackage.name) + self.assertEqual("0.10.2", discoveredpackage.version) + self.assertEqual("https://github.com/uiri/toml", discoveredpackage.homepage_url) + self.assertEqual("MIT", discoveredpackage.extracted_license_statement) + self.assertEqual("mit", discoveredpackage.declared_license_expression) + + def test_scanpipe_load_sbom_pipeline_cyclonedx_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "cyclonedx" / "nested.cdx.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(3, project1.discoveredpackages.count()) + packages = project1.discoveredpackages.all() + expected_data = { + "pkg:pypi/toml@0.10.2?extension=tar.gz": { + "type": "pypi", + "name": "toml", + "version": "0.10.2", + "extracted_license_statement": "OFL-1.1\nApache-2.0", + "declared_license_expression": "ofl-1.1 OR apache-2.0", + "homepage_url": "https://cyclonedx.org/website", + "bug_tracking_url": "https://cyclonedx.org/issue-tracker", + "vcs_url": "https://cyclonedx.org/vcs", + "filename": "", + }, + "pkg:pypi/billiard@3.6.3.0": { + "type": "pypi", + "name": "billiard", + "version": "3.6.3.0", + "extracted_license_statement": "BSD-3-Clause", + "declared_license_expression": "bsd-new", + "homepage_url": "", + "bug_tracking_url": "", + "vcs_url": "", + "extra_data": "", + "filename": "", + }, + "pkg:pypi/fictional@9.10.2": { + "type": "pypi", + "name": "fictional", + "version": "9.10.2", + "extracted_license_statement": ( + "LGPL-3.0-or-later" + " AND " + "LicenseRef-scancode-openssl-exception-lgpl3.0plus" + ), + "declared_license_expression": ( + "lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus" + ), + "homepage_url": "https://home.page", + "bug_tracking_url": "", + "vcs_url": "", + "extra_data": "", + "filename": "package.zip", + }, + } + + for package in packages: + expected = expected_data.get(str(package)) + self.assertEqual(expected["type"], package.type) + self.assertEqual(expected["name"], package.name) + self.assertEqual(expected["version"], package.version) + self.assertEqual(expected["homepage_url"], package.homepage_url) + self.assertEqual( + expected["extracted_license_statement"], + package.extracted_license_statement, + ) + self.assertEqual( + expected["declared_license_expression"], + package.declared_license_expression, + ) + self.assertEqual(expected["filename"], package.filename) + + def test_scanpipe_load_sbom_pipeline_cyclonedx_with_dependencies_integration(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = self.data / "cyclonedx" / "laravel-7.12.0" / "bom.1.4.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(62, project1.discoveredpackages.count()) + self.assertEqual(112, project1.discovereddependencies.count()) + dependency = project1.discovereddependencies.all()[0] + 
self.assertEqual("bom.1.4.json", str(dependency.datafile_resource)) + + def test_scanpipe_load_sbom_pipeline_cyclonedx_with_vulnerabilities(self): + pipeline_name = "load_sbom" + project1 = make_project() + + input_location = ( + self.data / "cyclonedx" / "python-3.13.0-vulnerabilities.cdx.json" + ) + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(1, project1.discoveredpackages.count()) + package = project1.discoveredpackages.get() + expected = [ + { + "vulnerability_id": "CVE-2005-2541", + "summary": "Tar 1.15.1 does not properly warn the user when...", + } + ] + self.assertEqual(expected, package.affected_by_vulnerabilities) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("uuid.uuid4") + def test_scanpipe_deploy_to_develop_pipeline_integration( + self, mock_uuid4, mock_request + ): + forced_uuid = "b74fe5df-e965-415e-ba65-f38421a0695d" + mock_uuid4.return_value = forced_uuid + mock_request.return_value = None + pipeline_name = "map_deploy_to_develop" + project1 = make_project(name="Analysis", uuid=forced_uuid) + selected_groups = ["Java"] + + jar_location = self.data / "d2d" / "jars" + project1.copy_input_from(jar_location / "from-flume-ng-node-1.9.0.zip") + project1.copy_input_from(jar_location / "to-flume-ng-node-1.9.0.zip") + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(57, project1.codebaseresources.count()) + self.assertEqual(18, project1.codebaserelations.count()) + self.assertEqual(1, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "d2d" / "flume-ng-node-d2d.json" + self.assertPipelineResultEqual(expected_file, result_file) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_deploy_to_develop_pipeline_integration_elfs(self): + pipeline_name = "map_deploy_to_develop" + project1 = make_project(name="Analysis") + selected_groups = ["Elf"] + + elf_location = self.data / "d2d-elfs" + project1.copy_input_from(elf_location / "from-brotli-d2d.zip") + project1.copy_input_from(elf_location / "to-brotli-d2d.zip") + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(17, project1.codebaseresources.count()) + self.assertEqual(7, project1.codebaserelations.count()) + + result_file = output.to_json(project1) + expected_file = self.data / "d2d-elfs" / "brotli-elf-d2d.json" + self.assertPipelineResultEqual(expected_file, result_file) + + def test_scanpipe_deploy_to_develop_pipeline_extract_input_files_errors(self): + project1 = make_project() + run = project1.add_pipeline("map_deploy_to_develop") + pipeline_instance = deploy_to_develop.DeployToDevelop(run) + + # Create 2 files in the input/ directory to generate error twice + project1.move_input_from(tempfile.mkstemp(prefix="from-")[1]) + project1.move_input_from(tempfile.mkstemp(prefix="to-")[1]) + self.assertEqual(2, len(project1.input_files)) + + pipeline_instance.get_inputs() + with 
mock.patch("scanpipe.pipes.scancode.extract_archive") as extract_archive: + extract_archive.return_value = {"path/to/resource": ["error1", "error2"]} + inputs_with_codebase_path_destination = [ + (pipeline_instance.from_files, project1.codebase_path / d2d.FROM), + (pipeline_instance.to_files, project1.codebase_path / d2d.TO), + ] + + for input_files, codebase_path in inputs_with_codebase_path_destination: + for input_file_path in input_files: + pipeline_instance.extract_archive(input_file_path, codebase_path) + + projects_errors = project1.projectmessages.all() + self.assertEqual(2, len(projects_errors)) + project_error = projects_errors[0] + self.assertEqual("error", project_error.severity) + self.assertEqual("error1\nerror2", project_error.description) + self.assertEqual("extract_archive", project_error.model) + self.assertEqual({"filename": "resource"}, project_error.details) + self.assertEqual("", project_error.traceback) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("uuid.uuid4") + def test_scanpipe_deploy_to_develop_pipeline_with_about_file( + self, mock_uuid4, mock_request + ): + forced_uuid = "90cb6382-431c-4187-be76-d4f1a2199a2f" + mock_uuid4.return_value = forced_uuid + mock_request.return_value = None + pipeline_name = "map_deploy_to_develop" + project1 = make_project(name="Analysis", uuid=forced_uuid) + selected_groups = ["Java"] + + data_dir = self.data / "d2d" / "about_files" + project1.copy_input_from(data_dir / "from-with-about-file.zip") + project1.copy_input_from(data_dir / "to-with-jar.zip") + + run = project1.add_pipeline( + pipeline_name=pipeline_name, selected_groups=selected_groups + ) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertEqual(44, project1.codebaseresources.count()) + self.assertEqual(31, project1.codebaserelations.count()) + self.assertEqual(2, project1.discoveredpackages.count()) + self.assertEqual(0, project1.discovereddependencies.count()) + + result_file = output.to_json(project1) + expected_file = data_dir / "expected.json" + self.assertPipelineResultEqual(expected_file, result_file) + + self.assertEqual(1, project1.projectmessages.count()) + message = project1.projectmessages.get() + self.assertEqual("map_about_files", message.model) + expected = ( + "Resource paths listed at about_resource is not found in the to/ codebase" + ) + self.assertIn(expected, message.description) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("scanpipe.pipes.purldb.is_available") + def test_scanpipe_populate_purldb_pipeline_integration( + self, mock_is_available, mock_request_post + ): + pipeline_name1 = "load_inventory" + pipeline_name2 = "populate_purldb" + project1 = make_project() + + input_location = self.data / "asgiref" / "asgiref-3.3.0_toolkit_scan.json" + project1.copy_input_from(input_location) + + run = project1.add_pipeline(pipeline_name1) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + def mock_request_post_return(url, data, headers, timeout): + payload = json.loads(data) + return { + "queued_packages_count": len(payload["packages"]), + "queued_packages": payload["packages"], + "unqueued_packages_count": 1, + "unqueued_packages": [], + "unsupported_packages_count": 1, + "unsupported_packages": [], + } + + mock_request_post.side_effect = mock_request_post_return + mock_is_available.return_value = True + + run = project1.add_pipeline(pipeline_name2) + 
pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertIn("Populating PurlDB with 2 PURLs from DiscoveredPackage", run.log) + self.assertIn("Successfully queued 2 PURLs for indexing in PurlDB", run.log) + self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) + self.assertIn("Couldn't index 1 unsupported PURLs", run.log) + + @mock.patch("scanpipe.pipes.purldb.request_post") + @mock.patch("scanpipe.pipes.purldb.is_available") + def test_scanpipe_populate_purldb_pipeline_integration_without_assembly( + self, mock_is_available, mock_request_post + ): + pipeline_name = "populate_purldb" + project1 = make_project() + + def mock_request_post_return(url, data, headers, timeout): + payload = json.loads(data) + return { + "queued_packages_count": len(payload["packages"]), + "queued_packages": payload["packages"], + "unqueued_packages_count": 1, + "unqueued_packages": [], + "unsupported_packages_count": 1, + "unsupported_packages": [], + } + + mock_request_post.side_effect = mock_request_post_return + mock_is_available.return_value = True + + package_json_location = self.data / "manifests" / "package.json" + copy_input(package_json_location, project1.codebase_path) + pipes.collect_and_create_codebase_resources(project1) + + scancode.scan_for_application_packages(project1, assemble=False) + scancode.process_package_data(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + self.assertIn("Populating PurlDB with 1 PURLs from DiscoveredPackage", run.log) + self.assertIn( + "Populating PurlDB with 6 unresolved PURLs from DiscoveredDependency", + run.log, + ) + self.assertIn("1 PURLs were already present in PurlDB index queue", run.log) + self.assertIn("Couldn't index 1 unsupported PURLs", run.log) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_collect_symbols_ctags_pipeline_integration(self): + pipeline_name = "collect_symbols_ctags" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "d2d-javascript" / "from" / "main.js" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data_symbols = main_file.extra_data.get("source_symbols") + expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] + self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) + + @skipIf(sys.platform != "linux", "Only supported on Linux") + def test_scanpipe_collect_strings_gettext_pipeline_integration(self): + pipeline_name = "collect_strings_gettext" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "d2d-javascript" / "from" / "main.js" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data_strings = 
main_file.extra_data.get("source_strings") + expected_extra_data_strings = [ + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=", # noqa + "Enter the desired length of your password:", + ] + self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_collect_symbols_pygments_pipeline_integration(self): + pipeline_name = "collect_symbols_pygments" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "source-inspector" / "test3.cpp" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data = main_file.extra_data + + expected_extra_data = ( + self.data / "source-inspector" / "test3.cpp-pygments-expected.json" + ) + + with open(expected_extra_data) as f: + expected_extra_data = json.load(f) + + self.assertDictEqual(expected_extra_data, result_extra_data) + + @skipIf(sys.platform == "darwin", "Not supported on macOS") + def test_scanpipe_collect_symbols_tree_sitter_pipeline_integration(self): + pipeline_name = "collect_symbols_tree_sitter" + project1 = make_project() + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data / "source-inspector" / "test3.cpp" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data = main_file.extra_data + + expected_extra_data = ( + self.data / "source-inspector" / "test3.cpp-tree-sitter-expected.json" + ) + + with open(expected_extra_data) as f: + expected_extra_data = json.load(f) + + self.assertDictEqual(expected_extra_data, result_extra_data) + + @mock.patch("scanpipe.pipes.purldb.is_available") + @mock.patch("scanpipe.pipes.purldb.is_configured") + @mock.patch("scanpipe.pipes.purldb.collect_data_for_purl") + def test_scanpipe_enrich_with_purldb_pipeline_integration( + self, mock_collect_data, mock_is_configured, mock_is_available + ): + pipeline_name = "enrich_with_purldb" + project1 = make_project() + package1 = make_package(project1, package_url="pkg:npm/csvtojson@2.0.10") + + mock_is_configured.return_value = True + mock_is_available.return_value = True + + purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json" + purldb_entry = json.loads(purldb_entry_file.read_text()) + mock_collect_data.return_value = [purldb_entry] + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + package1.refresh_from_db() + self.assertTrue(package1.extra_data.get("enrich_with_purldb")) + + run.refresh_from_db() + self.assertIn("pkg:npm/csvtojson@2.0.10 ['release_date'", run.log) + self.assertIn("1 discovered package enriched with the PurlDB.", run.log) + From 7f177b9b46c7b8a1668b4854daebe1074b5202e6 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 07:56:34 +0530 Subject: [PATCH 10/18] Revert "Revert "add tests for storing packages"" This 
reverts commit cd04f3f1062f3ac8c78af3a7b0ed042633f5b375.
---
 scancodeio/settings.py           | 979 +++++++++++++++----------------
 scanpipe/archiving.py            | 375 ++++++------
 scanpipe/pipelines/__init__.py   | 699 +++++++++++-----------
 scanpipe/pipes/input.py          | 692 +++++++++++-----------
 scanpipe/tests/test_archiving.py | 172 +++---
 scanpipe/tests/test_input.py     | 255 ++++----
 scanpipe/tests/test_pipelines.py |   1 +
 7 files changed, 1570 insertions(+), 1603 deletions(-)

diff --git a/scancodeio/settings.py b/scancodeio/settings.py
index 2d7686900c..15e52a4440 100644
--- a/scancodeio/settings.py
+++ b/scancodeio/settings.py
@@ -1,491 +1,488 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
-# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
-# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
-# ScanCode.io should be considered or used as legal advice. Consult an Attorney
-# for any legal advice.
-#
-# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
-# Visit https://github.com/aboutcode-org/scancode.io for support and download.
-
-import sys
-import tempfile
-from pathlib import Path
-import logging
-
-import environ
-
-from scanpipe.archiving import LocalFilesystemProvider
-
-
-PROJECT_DIR = environ.Path(__file__) - 1
-ROOT_DIR = PROJECT_DIR - 1
-
-# True if running tests through `./manage test`
-IS_TESTS = "test" in sys.argv
-
-# Environment
-
-ENV_FILE = "/etc/scancodeio/.env"
-if not Path(ENV_FILE).exists():
-    ENV_FILE = ROOT_DIR(".env")
-
-# Do not use local .env environment when running the tests.
-if IS_TESTS: - ENV_FILE = None - -env = environ.Env() -environ.Env.read_env(ENV_FILE) - -# Security - -SECRET_KEY = env.str("SECRET_KEY", default="") - -ALLOWED_HOSTS = env.list( - "ALLOWED_HOSTS", - default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], -) - -CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) - -# SECURITY WARNING: don't run with debug turned on in production -DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) - -SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( - "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False -) - -SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) - -SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) - -X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") - -SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) - -CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) - -# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT -# are handled by the web server. -SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] - -# ScanCode.io - -SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") - -SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") - -SCANCODEIO_CONFIG_FILE = env.str( - "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" -) - -SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") - -# Set the number of parallel processes to use for ScanCode related scan execution. -# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs -# available on the machine. -SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) - -SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") - -# This setting defines the additional locations ScanCode.io will search for pipelines. -# This should be set to a list of strings that contain full paths to your additional -# pipelines directories. -SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) - -# Maximum time allowed for a pipeline to complete. -SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") - -# Default to 2 minutes. -SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) - -# Default to None which scans all files -SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) - -# List views pagination, controls the number of items displayed per page. -# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 -SCANCODEIO_PAGINATE_BY = env.dict( - "SCANCODEIO_PAGINATE_BY", - default={ - "project": 20, - "error": 50, - "resource": 100, - "package": 100, - "dependency": 100, - "license": 100, - "relation": 100, - }, -) - -# Default limit for "most common" entries in QuerySets. -SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) - -# The base URL (e.g., https://hostname/) of this application instance. -# Required for generating URLs to reference objects within the app, -# such as in webhook notifications. 
-SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") - -# Fetch authentication credentials - -# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" -SCANCODEIO_FETCH_BASIC_AUTH = env.dict( - "SCANCODEIO_FETCH_BASIC_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" -SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( - "SCANCODEIO_FETCH_DIGEST_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" -SCANCODEIO_FETCH_HEADERS = {} -FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") -for entry in FETCH_HEADERS_STR.split(";"): - if entry.strip(): - host, headers = entry.split("=", 1) - SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) - -# SCANCODEIO_NETRC_LOCATION="~/.netrc" -SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") -if SCANCODEIO_NETRC_LOCATION: - # Propagate the location to the environ for `requests.utils.get_netrc_auth` - env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION - -# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" -SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) - -# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" -SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( - "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" -) - -# This webhook will be added as WebhookSubscription for each new project. -# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False -SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) - -# Application definition - -INSTALLED_APPS = [ - # Local apps - # Must come before Third-party apps for proper templates override - "scanpipe", - # Django built-in - "django.contrib.auth", - "django.contrib.contenttypes", - "django.contrib.sessions", - "django.contrib.messages", - "django.contrib.staticfiles", - "django.contrib.admin", - "django.contrib.humanize", - # Third-party apps - "crispy_forms", - "crispy_bootstrap3", # required for the djangorestframework browsable API - "django_filters", - "rest_framework", - "rest_framework.authtoken", - "django_rq", - "django_probes", - "taggit", -] - -MIDDLEWARE = [ - "django.middleware.security.SecurityMiddleware", - "django.contrib.sessions.middleware.SessionMiddleware", - "django.middleware.common.CommonMiddleware", - "django.middleware.csrf.CsrfViewMiddleware", - "django.contrib.auth.middleware.AuthenticationMiddleware", - "django.contrib.messages.middleware.MessageMiddleware", - "django.middleware.clickjacking.XFrameOptionsMiddleware", - "scancodeio.middleware.TimezoneMiddleware", -] - -ROOT_URLCONF = "scancodeio.urls" - -WSGI_APPLICATION = "scancodeio.wsgi.application" - -SECURE_PROXY_SSL_HEADER = env.tuple( - "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") -) - -# Database - -DATABASES = { - "default": { - "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), - "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), - "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), - "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), - "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), - "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), - "ATOMIC_REQUESTS": True, - } -} - -DEFAULT_AUTO_FIELD = "django.db.models.AutoField" - -# Forms and filters - -FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") - -# Templates - 
-TEMPLATES = [ - { - "BACKEND": "django.template.backends.django.DjangoTemplates", - "APP_DIRS": True, - "OPTIONS": { - "debug": DEBUG, - "context_processors": [ - "django.contrib.auth.context_processors.auth", - "django.contrib.messages.context_processors.messages", - "django.template.context_processors.request", - "scancodeio.context_processors.versions", - ], - }, - }, -] - -# Login - -LOGIN_REDIRECT_URL = "project_list" - -# Passwords - -AUTH_PASSWORD_VALIDATORS = [ - { - "NAME": ( - "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" - ), - }, - { - "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", - "OPTIONS": { - "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), - }, - }, - { - "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", - }, -] - -# Testing - -if IS_TESTS: - from django.core.management.utils import get_random_secret_key - - SECRET_KEY = get_random_secret_key() - # Do not pollute the workspace while running the tests. - SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() - SCANCODEIO_REQUIRE_AUTHENTICATION = True - SCANCODEIO_SCAN_FILE_TIMEOUT = 120 - SCANCODEIO_POLICIES_FILE = None - # The default password hasher is rather slow by design. - # Using a faster hashing algorithm in the testing context to speed up the run. - PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] - -# Debug toolbar - -DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) -if DEBUG and DEBUG_TOOLBAR: - INSTALLED_APPS.append("debug_toolbar") - MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") - INTERNAL_IPS = ["127.0.0.1"] - -# Logging - -LOGGING = { - "version": 1, - "disable_existing_loggers": False, - "formatters": { - "simple": { - "format": "{levelname} {message}", - "style": "{", - }, - }, - "handlers": { - "null": { - "class": "logging.NullHandler", - }, - "console": { - "class": "logging.StreamHandler", - "formatter": "simple", - }, - }, - "loggers": { - "scanpipe": { - "handlers": ["null"] if IS_TESTS else ["console"], - "level": SCANCODEIO_LOG_LEVEL, - "propagate": False, - }, - "django": { - "handlers": ["null"] if IS_TESTS else ["console"], - "propagate": False, - }, - # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. - "django.db.backends": { - "level": SCANCODEIO_LOG_LEVEL, - }, - }, -} - -# Instead of sending out real emails the console backend just writes the emails -# that would be sent to the standard output. 
-EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" - -# Internationalization - -LANGUAGE_CODE = "en-us" - -FORMAT_MODULE_PATH = ["scancodeio.formats"] - -TIME_ZONE = env.str("TIME_ZONE", default="UTC") - -USE_I18N = True - -USE_TZ = True - -# Static files (CSS, JavaScript, Images) - -STATIC_URL = "/static/" - -STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") - -STATICFILES_DIRS = [ - PROJECT_DIR("static"), -] - -# Third-party apps - -CRISPY_TEMPLATE_PACK = "bootstrap3" - -# Centralized archive directory for all projects -CENTRAL_ARCHIVE_PATH = env.str( - "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" -) - -# localstorage configuration -DOWNLOAD_ARCHIVING_PROVIDER = env.str( - "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" -) - -# For local storage, we would store the root path in that setting -DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( - "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None -) - -# Initialize the DownloadStore for local storage - -download_store = None -logger = logging.getLogger(__name__) -if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": - config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} - root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) - try: - download_store = LocalFilesystemProvider(root_path=root_path) - except Exception as e: - logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") -else: - logger.error( - f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}" - ) - -# Job Queue - -RQ_QUEUES = { - "default": { - "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), - "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), - "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), - "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), - "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), - "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), - # Enable SSL for Redis connections when deploying ScanCode.io in environments - # where Redis is hosted on a separate system (e.g., cloud deployment or remote - # Redis server) to secure data in transit. 
- "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), - }, -} - -SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) -if not SCANCODEIO_ASYNC: - for queue_config in RQ_QUEUES.values(): - queue_config["ASYNC"] = False - -# ClamAV virus scan -CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) -CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") - -# Django restframework - -REST_FRAMEWORK = { - "DEFAULT_AUTHENTICATION_CLASSES": ( - "rest_framework.authentication.TokenAuthentication", - ), - "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), - "DEFAULT_RENDERER_CLASSES": ( - "rest_framework.renderers.JSONRenderer", - "rest_framework.renderers.BrowsableAPIRenderer", - "rest_framework.renderers.AdminRenderer", - ), - "DEFAULT_FILTER_BACKENDS": ( - "django_filters.rest_framework.DjangoFilterBackend", - "rest_framework.filters.SearchFilter", - ), - "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", - "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), - "UPLOADED_FILES_USE_URL": False, -} - -if not SCANCODEIO_REQUIRE_AUTHENTICATION: - REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( - "rest_framework.permissions.AllowAny", - ) - -# VulnerableCode integration - -VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") -VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") -VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") -VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") - -# PurlDB integration - -PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") -PURLDB_USER = env.str("PURLDB_USER", default="") -PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") -PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") - -# MatchCode.io integration - -MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") -MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") -MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") -MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") - -# FederatedCode integration - -FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( - "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" -).rstrip("/") -FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") -FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") -FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. 
Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import logging +import sys +import tempfile +from pathlib import Path + +import environ + +from scanpipe.archiving import LocalFilesystemProvider + +PROJECT_DIR = environ.Path(__file__) - 1 +ROOT_DIR = PROJECT_DIR - 1 + +# True if running tests through `./manage test` +IS_TESTS = "test" in sys.argv + +# Environment + +ENV_FILE = "/etc/scancodeio/.env" +if not Path(ENV_FILE).exists(): + ENV_FILE = ROOT_DIR(".env") + +# Do not use local .env environment when running the tests. +if IS_TESTS: + ENV_FILE = None + +env = environ.Env() +environ.Env.read_env(ENV_FILE) + +# Security + +SECRET_KEY = env.str("SECRET_KEY", default="") + +ALLOWED_HOSTS = env.list( + "ALLOWED_HOSTS", + default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], +) + +CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) + +# SECURITY WARNING: don't run with debug turned on in production +DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) + +SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( + "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False +) + +SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) + +SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) + +X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") + +SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) + +CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) + +# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT +# are handled by the web server. +SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] + +# ScanCode.io + +SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") + +SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") + +SCANCODEIO_CONFIG_FILE = env.str( + "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" +) + +SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") + +# Set the number of parallel processes to use for ScanCode related scan execution. +# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs +# available on the machine. +SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) + +SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") + +# This setting defines the additional locations ScanCode.io will search for pipelines. +# This should be set to a list of strings that contain full paths to your additional +# pipelines directories. +SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) + +# Maximum time allowed for a pipeline to complete. +SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") + +# Default to 2 minutes. +SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) + +# Default to None which scans all files +SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) + +# List views pagination, controls the number of items displayed per page. 
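+# Keys are object type names; values are the number of items for that view.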
+# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 +SCANCODEIO_PAGINATE_BY = env.dict( + "SCANCODEIO_PAGINATE_BY", + default={ + "project": 20, + "error": 50, + "resource": 100, + "package": 100, + "dependency": 100, + "license": 100, + "relation": 100, + }, +) + +# Default limit for "most common" entries in QuerySets. +SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) + +# The base URL (e.g., https://hostname/) of this application instance. +# Required for generating URLs to reference objects within the app, +# such as in webhook notifications. +SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") + +# Fetch authentication credentials + +# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" +SCANCODEIO_FETCH_BASIC_AUTH = env.dict( + "SCANCODEIO_FETCH_BASIC_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" +SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( + "SCANCODEIO_FETCH_DIGEST_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" +SCANCODEIO_FETCH_HEADERS = {} +FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") +for entry in FETCH_HEADERS_STR.split(";"): + if entry.strip(): + host, headers = entry.split("=", 1) + SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) + +# SCANCODEIO_NETRC_LOCATION="~/.netrc" +SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") +if SCANCODEIO_NETRC_LOCATION: + # Propagate the location to the environ for `requests.utils.get_netrc_auth` + env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION + +# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" +SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) + +# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" +SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( + "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" +) + +# This webhook will be added as WebhookSubscription for each new project. 
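+# The value is a comma-separated key=value string parsed into a dict, e.g.: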
+# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False +SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) + +# Application definition + +INSTALLED_APPS = [ + # Local apps + # Must come before Third-party apps for proper templates override + "scanpipe", + # Django built-in + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", + "django.contrib.humanize", + # Third-party apps + "crispy_forms", + "crispy_bootstrap3", # required for the djangorestframework browsable API + "django_filters", + "rest_framework", + "rest_framework.authtoken", + "django_rq", + "django_probes", + "taggit", +] + +MIDDLEWARE = [ + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", + "scancodeio.middleware.TimezoneMiddleware", +] + +ROOT_URLCONF = "scancodeio.urls" + +WSGI_APPLICATION = "scancodeio.wsgi.application" + +SECURE_PROXY_SSL_HEADER = env.tuple( + "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") +) + +# Database + +DATABASES = { + "default": { + "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), + "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), + "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), + "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), + "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, + } +} + +DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + +# Forms and filters + +FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") + +# Templates + +TEMPLATES = [ + { + "BACKEND": "django.template.backends.django.DjangoTemplates", + "APP_DIRS": True, + "OPTIONS": { + "debug": DEBUG, + "context_processors": [ + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "django.template.context_processors.request", + "scancodeio.context_processors.versions", + ], + }, + }, +] + +# Login + +LOGIN_REDIRECT_URL = "project_list" + +# Passwords + +AUTH_PASSWORD_VALIDATORS = [ + { + "NAME": ( + "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" + ), + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + "OPTIONS": { + "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), + }, + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, +] + +# Testing + +if IS_TESTS: + from django.core.management.utils import get_random_secret_key + + SECRET_KEY = get_random_secret_key() + # Do not pollute the workspace while running the tests. + SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() + SCANCODEIO_REQUIRE_AUTHENTICATION = True + SCANCODEIO_SCAN_FILE_TIMEOUT = 120 + SCANCODEIO_POLICIES_FILE = None + # The default password hasher is rather slow by design. + # Using a faster hashing algorithm in the testing context to speed up the run. 
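    # MD5 is acceptable here since only throwaway test credentials are hashed.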
+ PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] + +# Debug toolbar + +DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) +if DEBUG and DEBUG_TOOLBAR: + INSTALLED_APPS.append("debug_toolbar") + MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") + INTERNAL_IPS = ["127.0.0.1"] + +# Logging + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "simple": { + "format": "{levelname} {message}", + "style": "{", + }, + }, + "handlers": { + "null": { + "class": "logging.NullHandler", + }, + "console": { + "class": "logging.StreamHandler", + "formatter": "simple", + }, + }, + "loggers": { + "scanpipe": { + "handlers": ["null"] if IS_TESTS else ["console"], + "level": SCANCODEIO_LOG_LEVEL, + "propagate": False, + }, + "django": { + "handlers": ["null"] if IS_TESTS else ["console"], + "propagate": False, + }, + # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. + "django.db.backends": { + "level": SCANCODEIO_LOG_LEVEL, + }, + }, +} + +# Instead of sending out real emails the console backend just writes the emails +# that would be sent to the standard output. +EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" + +# Internationalization + +LANGUAGE_CODE = "en-us" + +FORMAT_MODULE_PATH = ["scancodeio.formats"] + +TIME_ZONE = env.str("TIME_ZONE", default="UTC") + +USE_I18N = True + +USE_TZ = True + +# Static files (CSS, JavaScript, Images) + +STATIC_URL = "/static/" + +STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") + +STATICFILES_DIRS = [ + PROJECT_DIR("static"), +] + +# Third-party apps + +CRISPY_TEMPLATE_PACK = "bootstrap3" + +# Centralized archive directory for all projects +CENTRAL_ARCHIVE_PATH = env.str( + "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" +) + +# localstorage configuration +DOWNLOAD_ARCHIVING_PROVIDER = env.str( + "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" +) + +# For local storage, we would store the root path in that setting +DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( + "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None +) + +# Initialize the DownloadStore for local storage + +download_store = None +logger = logging.getLogger(__name__) +if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": + config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} + root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) + try: + download_store = LocalFilesystemProvider(root_path=root_path) + except Exception as e: + logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") +else: + logger.error(f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}") + +# Job Queue + +RQ_QUEUES = { + "default": { + "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), + "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), + "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), + "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), + "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), + "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), + # Enable SSL for Redis connections when deploying ScanCode.io in environments + # where Redis is hosted on a separate system (e.g., cloud deployment or remote + # Redis server) to secure data in transit. 
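        # Set SCANCODEIO_RQ_REDIS_SSL=true in the .env file to enable it.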
+ "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), + }, +} + +SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) +if not SCANCODEIO_ASYNC: + for queue_config in RQ_QUEUES.values(): + queue_config["ASYNC"] = False + +# ClamAV virus scan +CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) +CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") + +# Django restframework + +REST_FRAMEWORK = { + "DEFAULT_AUTHENTICATION_CLASSES": ( + "rest_framework.authentication.TokenAuthentication", + ), + "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), + "DEFAULT_RENDERER_CLASSES": ( + "rest_framework.renderers.JSONRenderer", + "rest_framework.renderers.BrowsableAPIRenderer", + "rest_framework.renderers.AdminRenderer", + ), + "DEFAULT_FILTER_BACKENDS": ( + "django_filters.rest_framework.DjangoFilterBackend", + "rest_framework.filters.SearchFilter", + ), + "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", + "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), + "UPLOADED_FILES_USE_URL": False, +} + +if not SCANCODEIO_REQUIRE_AUTHENTICATION: + REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( + "rest_framework.permissions.AllowAny", + ) + +# VulnerableCode integration + +VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") +VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") +VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") +VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") + +# PurlDB integration + +PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") +PURLDB_USER = env.str("PURLDB_USER", default="") +PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") +PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") + +# MatchCode.io integration + +MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") +MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") +MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") +MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") + +# FederatedCode integration + +FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( + "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" +).rstrip("/") +FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") +FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") +FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py index 482f448de5..3f3d66e2e8 100644 --- a/scanpipe/archiving.py +++ b/scanpipe/archiving.py @@ -1,190 +1,185 @@ -# scanpipe/archiving.py -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
-# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import json -import logging -import os -import stat -from abc import ABC -from abc import abstractmethod -from dataclasses import dataclass -from pathlib import Path - - -logger = logging.getLogger(__name__) - - -@dataclass -class Download: - sha256: str - download_date: str - download_url: str - filename: str - - -class DownloadStore(ABC): - def _compute_sha256(self, content: bytes) -> str: - """Compute SHA256 hash for content.""" - return hashlib.sha256(content).hexdigest() - - def _compute_origin_hash( - self, filename: str, download_date: str, download_url: str - ) -> str: - """Compute a hash for the metadata to name the origin JSON file.""" - to_hash = f"{filename}{download_date}{download_url}".encode() - return hashlib.sha256(to_hash).hexdigest() - - def _build_metadata( - self, sha256: str, filename: str, download_date: str, download_url: str - ) -> dict: - """Build metadata dictionary for JSON storage.""" - return { - "sha256": sha256, - "filename": filename, - "download_date": download_date, - "download_url": download_url, - } - - @abstractmethod - def _get_content_path(self, sha256: str) -> str: - """Get the storage path/key for the content based on SHA256.""" - pass - - @abstractmethod - def list(self): - """Return an iterable of all stored downloads.""" - pass - - @abstractmethod - def get(self, sha256_checksum: str): - """Return a Download object for this checksum or None.""" - pass - - @abstractmethod - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """ - Store content with its metadata. Return a Download object on success. - Raise an exception on error. - """ - pass - - @abstractmethod - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Return a Download object matching the metadata or None.""" - pass - - -class LocalFilesystemProvider(DownloadStore): - def __init__(self, root_path: Path): - self.root_path = root_path - - def _get_content_path(self, sha256: str) -> Path: - """Create a nested path like 59/4c/67/... 
based on the SHA256 hash.""" - return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] - - def list(self): - """Return an iterable of all stored downloads.""" - downloads = [] - for content_path in self.root_path.rglob("content"): - origin_files = list(content_path.parent.glob("origin-*.json")) - for origin_file in origin_files: - try: - with open(origin_file) as f: - data = json.load(f) - downloads.append(Download(**data)) - except Exception as e: - logger.error(f"Error reading {origin_file}: {e}") - return downloads - - def get(self, sha256_checksum: str): - """Retrieve a Download object for the given SHA256 hash.""" - content_path = self._get_content_path(sha256_checksum) - if content_path.exists(): - origin_files = list(content_path.glob("origin-*.json")) - if origin_files: - try: - with open(origin_files[0]) as f: - data = json.load(f) - return Download(**data) - except Exception as e: - logger.error( - f"Error reading origin file for {sha256_checksum}: {e}" - ) - return None - - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store the content and its metadata.""" - sha256 = self._compute_sha256(content) - content_path = self._get_content_path(sha256) - content_path.mkdir(parents=True, exist_ok=True) - - content_file = content_path / "content" - if not content_file.exists(): - try: - with open(content_file, "wb") as f: - f.write(content) - except Exception as e: - raise Exception(f"Failed to write content to {content_file}: {e}") - - origin_hash = self._compute_origin_hash(filename, download_date, download_url) - origin_filename = f"origin-{origin_hash}.json" - origin_path = content_path / origin_filename - if origin_path.exists(): - raise Exception(f"Origin {origin_filename} already exists") - - metadata = self._build_metadata(sha256, filename, download_date, download_url) - try: - with open(origin_path, "w") as f: - json.dump(metadata, f, indent=2) - except Exception as e: - raise Exception(f"Failed to write metadata to {origin_path}: {e}") - - return Download(**metadata) - - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Find a download based on metadata.""" - if not (download_url or filename or download_date): - return None - for content_path in self.root_path.rglob("origin-*.json"): - try: - with open(content_path) as f: - data = json.load(f) - if ( - (download_url is None or data.get("url") == download_url) - and (filename is None or data.get("filename") == filename) - and ( - download_date is None - or data.get("download_date") == download_date - ) - ): - return Download(**data) - except Exception as e: - logger.error(f"Error reading {content_path}: {e}") - return None - - +# scanpipe/archiving.py +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import hashlib
+import json
+import logging
+from abc import ABC
+from abc import abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Download:
+    sha256: str
+    download_date: str
+    download_url: str
+    filename: str
+    # Storage location of the archived content, set by the provider on put().
+    # Consumers such as InputSource.file_path rely on this attribute.
+    path: str = ""
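+
+
+# On disk, a download stored through the local filesystem provider is laid out
+# as follows (sketch):
+#
+#   <root_path>/59/4c/67.../content             the raw downloaded bytes
+#   <root_path>/59/4c/67.../origin-<hash>.json  one metadata file for each
+#                                               (filename, date, url) origin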
+
+
+class DownloadStore(ABC):
+    def _compute_sha256(self, content: bytes) -> str:
+        """Compute SHA256 hash for content."""
+        return hashlib.sha256(content).hexdigest()
+
+    def _compute_origin_hash(
+        self, filename: str, download_date: str, download_url: str
+    ) -> str:
+        """Compute a hash for the metadata to name the origin JSON file."""
+        to_hash = f"{filename}{download_date}{download_url}".encode()
+        return hashlib.sha256(to_hash).hexdigest()
+
+    def _build_metadata(
+        self, sha256: str, filename: str, download_date: str, download_url: str
+    ) -> dict:
+        """Build metadata dictionary for JSON storage."""
+        return {
+            "sha256": sha256,
+            "filename": filename,
+            "download_date": download_date,
+            "download_url": download_url,
+        }
+
+    @abstractmethod
+    def _get_content_path(self, sha256: str) -> str:
+        """Get the storage path/key for the content based on SHA256."""
+        pass
+
+    @abstractmethod
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        pass
+
+    @abstractmethod
+    def get(self, sha256_checksum: str):
+        """Return a Download object for this checksum or None."""
+        pass
+
+    @abstractmethod
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """
+        Store content with its metadata. Return a Download object on success.
+        Raise an exception on error.
+        """
+        pass
+
+    @abstractmethod
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Return a Download object matching the metadata or None."""
+        pass
+
+
+class LocalFilesystemProvider(DownloadStore):
+    def __init__(self, root_path: Path):
+        self.root_path = root_path
+
+    def _get_content_path(self, sha256: str) -> Path:
+        """Create a nested path like 59/4c/67/... based on the SHA256 hash."""
+        return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]
+
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        downloads = []
+        for content_path in self.root_path.rglob("content"):
+            origin_files = list(content_path.parent.glob("origin-*.json"))
+            for origin_file in origin_files:
+                try:
+                    with open(origin_file) as f:
+                        data = json.load(f)
+                    downloads.append(Download(**data))
+                except Exception as e:
+                    logger.error(f"Error reading {origin_file}: {e}")
+        return downloads
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        content_path = self._get_content_path(sha256_checksum)
+        if content_path.exists():
+            origin_files = list(content_path.glob("origin-*.json"))
+            if origin_files:
+                try:
+                    with open(origin_files[0]) as f:
+                        data = json.load(f)
+                    return Download(**data)
+                except Exception as e:
+                    logger.error(
+                        f"Error reading origin file for {sha256_checksum}: {e}"
+                    )
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_path = self._get_content_path(sha256)
+        content_path.mkdir(parents=True, exist_ok=True)
+
+        content_file = content_path / "content"
+        if not content_file.exists():
+            try:
+                with open(content_file, "wb") as f:
+                    f.write(content)
+            except Exception as e:
+                raise Exception(f"Failed to write content to {content_file}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_path = content_path / origin_filename
+        if origin_path.exists():
+            raise Exception(f"Origin {origin_filename} already exists")
+
+        metadata = self._build_metadata(sha256, filename, download_date, download_url)
+        try:
+            with open(origin_path, "w") as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            raise Exception(f"Failed to write metadata to {origin_path}: {e}")
+
+        return Download(path=str(content_file), **metadata)
+
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        for origin_path in self.root_path.rglob("origin-*.json"):
+            try:
+                with open(origin_path) as f:
+                    data = json.load(f)
+                if (
+                    (download_url is None or data.get("download_url") == download_url)
+                    and (filename is None or data.get("filename") == filename)
+                    and (
+                        download_date is None
+                        or data.get("download_date") == download_date
+                    )
+                ):
+                    return Download(**data)
+            except Exception as e:
+                logger.error(f"Error reading {origin_path}: {e}")
+        return None
diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py
index 1b6cd4e0a0..5153bf1887 100644
--- a/scanpipe/pipelines/__init__.py
+++ b/scanpipe/pipelines/__init__.py
@@ -1,346 +1,353 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import inspect -import logging -import traceback -import hashlib -from contextlib import contextmanager -from datetime import datetime -from functools import wraps -from pathlib import Path - -import bleach -import requests -from markdown_it import MarkdownIt -from pyinstrument import Profiler - -from aboutcode.pipeline import BasePipeline -from scancodeio.settings import download_store - -logger = logging.getLogger(__name__) - - -class InputFilesError(Exception): - """InputFile is missing or cannot be downloaded.""" - - def __init__(self, error_tracebacks): - self.error_tracebacks = error_tracebacks - super().__init__(self._generate_message()) - - def _generate_message(self): - message = "InputFilesError encountered with the following issues:\n" - for index, (error, tb) in enumerate(self.error_tracebacks, start=1): - message += f"\nError {index}: {str(error)}\n\n{tb}" - return message - - -def convert_markdown_to_html(markdown_text): - """Convert Markdown text to sanitized HTML.""" - # Using the "js-default" for safety. - html_content = MarkdownIt("js-default").renderInline(markdown_text) - # Sanitize HTML using bleach. - sanitized_html = bleach.clean(html_content) - return sanitized_html - - -class CommonStepsMixin: - """Common steps available on all project pipelines.""" - - def flag_empty_files(self): - """Flag empty files.""" - from scanpipe.pipes import flag - - flag.flag_empty_files(self.project) - - def flag_ignored_resources(self): - """Flag ignored resources based on Project ``ignored_patterns`` setting.""" - from scanpipe.pipes import flag - - ignored_patterns = self.env.get("ignored_patterns", []) - - if isinstance(ignored_patterns, str): - ignored_patterns = ignored_patterns.splitlines() - ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS) - - flag.flag_ignored_patterns( - codebaseresources=self.project.codebaseresources.no_status(), - patterns=ignored_patterns, - ) - - def extract_archive(self, location, target): - """Extract archive at `location` to `target`. 
Save errors as messages.""" - from scanpipe.pipes import scancode - - extract_errors = scancode.extract_archive(location, target) - - for resource_location, errors in extract_errors.items(): - resource_path = Path(resource_location) - - if resource_path.is_relative_to(self.project.codebase_path): - resource_path = resource_path.relative_to(self.project.codebase_path) - details = {"resource_path": str(resource_path)} - elif resource_path.is_relative_to(self.project.input_path): - resource_path = resource_path.relative_to(self.project.input_path) - details = {"path": f"input/{str(resource_path)}"} - else: - details = {"filename": str(resource_path.name)} - - self.project.add_error( - description="\n".join(errors), - model="extract_archive", - details=details, - ) - - def extract_archives(self, location=None): - """Extract archives located in the codebase/ directory with extractcode.""" - from scanpipe.pipes import scancode - - if not location: - location = self.project.codebase_path - - extract_errors = scancode.extract_archives(location=location, recurse=True) - - for resource_path, errors in extract_errors.items(): - self.project.add_error( - description="\n".join(errors), - model="extract_archives", - details={"resource_path": resource_path}, - ) - - # Reload the project env post-extraction as the scancode-config.yml file - # may be located in one of the extracted archives. - self.env = self.project.get_env() - - def download_missing_inputs(self): - """ - Download any InputSource missing on disk. - Raise an error if any of the uploaded files is not available or not reachable. - """ - error_tracebacks = [] - - for input_source in self.project.inputsources.all(): - if input_source.exists(): - continue - - if input_source.is_uploaded: - msg = f"Uploaded file {input_source} not available." - self.log(msg) - error_tracebacks.append((msg, "No traceback available.")) - continue - - download_url = input_source.download_url - if not download_url: - continue - - url_hash = hashlib.sha256(download_url.encode()).hexdigest() - filename = input_source.filename or Path(download_url).name or f"{url_hash}.archive" - archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if archive_path.exists(): - logger.info(f"Reusing existing archive at {archive_path}") - input_source.file_path = str(archive_path) - input_source.save() - continue - - self.log(f"Fetching input from {input_source.download_url}") - try: - input_source.fetch() - - except Exception as error: - traceback_str = traceback.format_exc() - logger.error(traceback_str) - self.log(f"{input_source.download_url} could not be fetched.") - error_tracebacks.append((str(error), traceback_str)) - - if error_tracebacks: - raise InputFilesError(error_tracebacks) - - def archive_downloads(self): - """ - Archive downloaded inputs to the centralized DownloadStore if not already - archived.Updates InputSource with archiving metadata (sha256, download_date). 
- """ - logger.info(f"Archiving downloads for project {self.project.name}") - for input_source in self.project.inputsources.filter( - sha256__isnull=True, is_uploaded=False - ): - if input_source.download_url: - try: - response = requests.get( - input_source.download_url, stream=True,timeout=30 - ) - response.raise_for_status() - content = response.content - filename = ( - input_source.filename - or input_source.download_url.split("/")[-1] - ) - download = download_store.put( - content=content, - download_url=input_source.download_url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - input_source.sha256 = download.sha256 - input_source.download_date = download.download_date - input_source.save() - except Exception as e: - self.add_error( - exception=e, - message=f"Failed to archive {input_source.download_url}", - ) - else: - logger.warning( - f"No download URL for input {input_source.filename}," - "skipping archiving" - ) - - -class ProjectPipeline(CommonStepsMixin, BasePipeline): - """Main class for all project related pipelines including common steps methods.""" - - # Flag specifying whether to download missing inputs as an initial step. - download_inputs = True - - # Optional URL that targets a view of the results relative to this Pipeline. - # This URL may contain dictionary-style string formatting, which will be - # interpolated against the project's field attributes. - # For example, you could use results_url="/project/{slug}/packages/?filter=value" - # to target the Package list view with an active filtering. - results_url = "" - - def __init__(self, run_instance): - """Load the Pipeline execution context from a Run database object.""" - self.run = run_instance - self.project = run_instance.project - self.env = self.project.get_env() - - self.pipeline_class = run_instance.pipeline_class - self.pipeline_name = run_instance.pipeline_name - - self.selected_groups = run_instance.selected_groups or [] - self.selected_steps = run_instance.selected_steps or [] - - self.ecosystem_config = None - - @classmethod - def get_initial_steps(cls): - """Add the ``download_inputs`` step as an initial step if enabled.""" - steps = [] - if cls.download_inputs: - steps.append(cls.download_missing_inputs) - if ENABLE_DOWNLOAD_ARCHIVING: - steps.append(cls.archive_downloads) - return tuple(steps) - - @classmethod - def get_info(cls, as_html=False): - """Add the option to render the values as HTML.""" - info = super().get_info() - - if as_html: - info["summary"] = convert_markdown_to_html(info["summary"]) - info["description"] = convert_markdown_to_html(info["description"]) - for step in info["steps"]: - step["doc"] = convert_markdown_to_html(step["doc"]) - - return info - - def append_to_log(self, message): - self.run.append_to_log(message) - - def set_current_step(self, message): - self.run.set_current_step(message) - - def add_error(self, exception, resource=None): - """Create a ``ProjectMessage`` ERROR record on the current `project`.""" - self.project.add_error( - model=self.pipeline_name, - exception=exception, - object_instance=resource, - ) - - @contextmanager - def save_errors(self, *exceptions, **kwargs): - """ - Context manager to save specified exceptions as ``ProjectMessage`` in the - database. 
- - - Example in a Pipeline step:: - - with self.save_errors(rootfs.DistroNotFound): - rootfs.scan_rootfs_for_system_packages(self.project, rfs) - - - Example when iterating over resources:: - - for resource in self.project.codebaseresources.all(): - with self.save_errors(Exception, resource=resource): - analyse(resource) - """ - try: - yield - except exceptions as error: - self.add_error(exception=error, **kwargs) - - -class Pipeline(ProjectPipeline): - """Alias for the ProjectPipeline class.""" - - pass - - -def is_pipeline(obj): - """ - Return True if the `obj` is a subclass of `Pipeline` except for the - `Pipeline` class itself. - """ - return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline - - -def profile(step): - """ - Profile a Pipeline step and save the results as HTML file in the project output - directory. - - Usage: - @profile - def step(self): - pass - """ - - @wraps(step) - def wrapper(*arg, **kwargs): - pipeline_instance = arg[0] - project = pipeline_instance.project - - with Profiler() as profiler: - result = step(*arg, **kwargs) - - output_file = project.get_output_file_path("profile", "html") - output_file.write_text(profiler.output_html()) - - pipeline_instance.log(f"Profiling results at {output_file.resolve()}") - - return result - - return wrapper +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
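+
+# NOTE: besides the regular pipeline machinery, this module wires the optional
+# download-archiving step: when a DownloadStore is configured in
+# scancodeio.settings, ``archive_downloads`` runs right after
+# ``download_missing_inputs`` (see ``get_initial_steps``).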
+
+import hashlib
+import inspect
+import logging
+import traceback
+from contextlib import contextmanager
+from datetime import datetime
+from functools import wraps
+from pathlib import Path
+
+from django.conf import settings
+
+import bleach
+from markdown_it import MarkdownIt
+from pyinstrument import Profiler
+
+from aboutcode.pipeline import BasePipeline
+from scancodeio.settings import download_store
+
+logger = logging.getLogger(__name__)
+
+
+class InputFilesError(Exception):
+    """InputFile is missing or cannot be downloaded."""
+
+    def __init__(self, error_tracebacks):
+        self.error_tracebacks = error_tracebacks
+        super().__init__(self._generate_message())
+
+    def _generate_message(self):
+        message = "InputFilesError encountered with the following issues:\n"
+        for index, (error, tb) in enumerate(self.error_tracebacks, start=1):
+            message += f"\nError {index}: {str(error)}\n\n{tb}"
+        return message
+
+
+def convert_markdown_to_html(markdown_text):
+    """Convert Markdown text to sanitized HTML."""
+    # Using the "js-default" preset for safety.
+    html_content = MarkdownIt("js-default").renderInline(markdown_text)
+    # Sanitize HTML using bleach.
+    sanitized_html = bleach.clean(html_content)
+    return sanitized_html
+
+
+class CommonStepsMixin:
+    """Common steps available on all project pipelines."""
+
+    def flag_empty_files(self):
+        """Flag empty files."""
+        from scanpipe.pipes import flag
+
+        flag.flag_empty_files(self.project)
+
+    def flag_ignored_resources(self):
+        """Flag ignored resources based on Project ``ignored_patterns`` setting."""
+        from scanpipe.pipes import flag
+
+        ignored_patterns = self.env.get("ignored_patterns", [])
+
+        if isinstance(ignored_patterns, str):
+            ignored_patterns = ignored_patterns.splitlines()
+        ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)
+
+        flag.flag_ignored_patterns(
+            codebaseresources=self.project.codebaseresources.no_status(),
+            patterns=ignored_patterns,
+        )
+
+    def extract_archive(self, location, target):
+        """Extract archive at `location` to `target`. Save errors as messages."""
+        from scanpipe.pipes import scancode
+
+        extract_errors = scancode.extract_archive(location, target)
+
+        for resource_location, errors in extract_errors.items():
+            resource_path = Path(resource_location)
+
+            if resource_path.is_relative_to(self.project.codebase_path):
+                resource_path = resource_path.relative_to(self.project.codebase_path)
+                details = {"resource_path": str(resource_path)}
+            elif resource_path.is_relative_to(self.project.input_path):
+                resource_path = resource_path.relative_to(self.project.input_path)
+                details = {"path": f"input/{str(resource_path)}"}
+            else:
+                details = {"filename": str(resource_path.name)}
+
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archive",
+                details=details,
+            )
+
+    def extract_archives(self, location=None):
+        """Extract archives located in the codebase/ directory with extractcode."""
+        from scanpipe.pipes import scancode
+
+        if not location:
+            location = self.project.codebase_path
+
+        extract_errors = scancode.extract_archives(location=location, recurse=True)
+
+        for resource_path, errors in extract_errors.items():
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archives",
+                details={"resource_path": resource_path},
+            )
+
+        # Reload the project env post-extraction as the scancode-config.yml file
+        # may be located in one of the extracted archives.
+        self.env = self.project.get_env()
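+
+    # Missing downloads are first looked up in settings.CENTRAL_ARCHIVE_PATH,
+    # keyed by the SHA256 hash of the download URL, so identical inputs are
+    # reused across projects instead of being fetched again.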
+    def download_missing_inputs(self):
+        """
+        Download any InputSource missing on disk.
+        Raise an error if any of the uploaded files is not available or not reachable.
+        """
+        error_tracebacks = []
+
+        for input_source in self.project.inputsources.all():
+            if input_source.exists():
+                continue
+
+            if input_source.is_uploaded:
+                msg = f"Uploaded file {input_source} not available."
+                self.log(msg)
+                error_tracebacks.append((msg, "No traceback available."))
+                continue
+
+            download_url = input_source.download_url
+            if not download_url:
+                continue
+
+            url_hash = hashlib.sha256(download_url.encode()).hexdigest()
+            filename = (
+                input_source.filename
+                or Path(download_url).name
+                or f"{url_hash}.archive"
+            )
+            archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
+
+            if archive_path.exists():
+                logger.info(f"Reusing existing archive at {archive_path}")
+                input_source.file_path = str(archive_path)
+                input_source.save()
+                continue
+
+            self.log(f"Fetching input from {input_source.download_url}")
+            try:
+                input_source.fetch()
+
+            except Exception as error:
+                traceback_str = traceback.format_exc()
+                logger.error(traceback_str)
+                self.log(f"{input_source.download_url} could not be fetched.")
+                error_tracebacks.append((str(error), traceback_str))
+
+        if error_tracebacks:
+            raise InputFilesError(error_tracebacks)
+
+    def archive_downloads(self):
+        """
+        Archive downloaded inputs to the centralized DownloadStore if not already
+        archived. Update InputSource with archiving metadata (sha256, download_date).
+        """
+        if download_store is None:
+            logger.warning("No DownloadStore is configured, skipping archiving.")
+            return
+
+        logger.info(f"Archiving downloads for project {self.project.name}")
+        for input_source in self.project.inputsources.filter(
+            sha256__isnull=True, is_uploaded=False
+        ):
+            if not input_source.download_url:
+                logger.warning(
+                    f"No download URL for input {input_source.filename}, "
+                    "skipping archiving"
+                )
+                continue
+
+            if not input_source.file_path:
+                logger.warning(
+                    f"No file_path for input {input_source.download_url}, "
+                    "skipping archiving"
+                )
+                continue
+            try:
+                with open(input_source.file_path, "rb") as f:
+                    content = f.read()
+                filename = (
+                    input_source.filename or input_source.download_url.split("/")[-1]
+                )
+                download = download_store.put(
+                    content=content,
+                    download_url=input_source.download_url,
+                    download_date=datetime.now().isoformat(),
+                    filename=filename,
+                )
+                input_source.sha256 = download.sha256
+                input_source.download_date = download.download_date
+                input_source.file_path = str(download.path)
+                input_source.save()
+            except Exception as e:
+                logger.error(f"Failed to archive {input_source.download_url}: {e}")
+                self.add_error(exception=e)
+
+
+class ProjectPipeline(CommonStepsMixin, BasePipeline):
+    """Main class for all project related pipelines including common steps methods."""
+
+    # Flag specifying whether to download missing inputs as an initial step.
+    download_inputs = True
+
+    # Optional URL that targets a view of the results relative to this Pipeline.
+    # This URL may contain dictionary-style string formatting, which will be
+    # interpolated against the project's field attributes.
+    # For example, you could use results_url="/project/{slug}/packages/?filter=value"
+    # to target the Package list view with an active filtering.
+ results_url = "" + + def __init__(self, run_instance): + """Load the Pipeline execution context from a Run database object.""" + self.run = run_instance + self.project = run_instance.project + self.env = self.project.get_env() + + self.pipeline_class = run_instance.pipeline_class + self.pipeline_name = run_instance.pipeline_name + + self.selected_groups = run_instance.selected_groups or [] + self.selected_steps = run_instance.selected_steps or [] + + self.ecosystem_config = None + + @classmethod + def get_initial_steps(cls): + """Add the ``download_inputs`` step as an initial step if enabled.""" + steps = [] + if cls.download_inputs: + steps.append(cls.download_missing_inputs) + steps.append(cls.archive_downloads) + return tuple(steps) + + @classmethod + def get_info(cls, as_html=False): + """Add the option to render the values as HTML.""" + info = super().get_info() + + if as_html: + info["summary"] = convert_markdown_to_html(info["summary"]) + info["description"] = convert_markdown_to_html(info["description"]) + for step in info["steps"]: + step["doc"] = convert_markdown_to_html(step["doc"]) + + return info + + def append_to_log(self, message): + self.run.append_to_log(message) + + def set_current_step(self, message): + self.run.set_current_step(message) + + def add_error(self, exception, resource=None): + """Create a ``ProjectMessage`` ERROR record on the current `project`.""" + self.project.add_error( + model=self.pipeline_name, + exception=exception, + object_instance=resource, + ) + + @contextmanager + def save_errors(self, *exceptions, **kwargs): + """ + Context manager to save specified exceptions as ``ProjectMessage`` in the + database. + + - Example in a Pipeline step:: + + with self.save_errors(rootfs.DistroNotFound): + rootfs.scan_rootfs_for_system_packages(self.project, rfs) + + - Example when iterating over resources:: + + for resource in self.project.codebaseresources.all(): + with self.save_errors(Exception, resource=resource): + analyse(resource) + """ + try: + yield + except exceptions as error: + self.add_error(exception=error, **kwargs) + + +class Pipeline(ProjectPipeline): + """Alias for the ProjectPipeline class.""" + + pass + + +def is_pipeline(obj): + """ + Return True if the `obj` is a subclass of `Pipeline` except for the + `Pipeline` class itself. + """ + return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline + + +def profile(step): + """ + Profile a Pipeline step and save the results as HTML file in the project output + directory. + + Usage: + @profile + def step(self): + pass + """ + + @wraps(step) + def wrapper(*arg, **kwargs): + pipeline_instance = arg[0] + project = pipeline_instance.project + + with Profiler() as profiler: + result = step(*arg, **kwargs) + + output_file = project.get_output_file_path("profile", "html") + output_file.write_text(profiler.output_html()) + + pipeline_instance.log(f"Profiling results at {output_file.resolve()}") + + return result + + return wrapper diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 81ae91c21d..906a2ee3a1 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -1,347 +1,345 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. 
-# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import logging -import os -import shutil -from datetime import datetime -from pathlib import Path - -from django.core.exceptions import FieldDoesNotExist -from django.core.validators import EMPTY_VALUES -from django.db import models - -import openpyxl -import requests -from typecode.contenttype import get_type - -from scanpipe import pipes -from scanpipe.models import CodebaseRelation -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredDependency -from scanpipe.models import DiscoveredLicense -from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource -from scanpipe.pipes import scancode -from scanpipe.pipes.output import mappings_key_by_fieldname -from scancodeio.settings import download_store - -logger = logging.getLogger(__name__) - - -def copy_input(input_location, dest_path): - """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" - input_path = Path(input_location) - destination_dir = Path(dest_path) - destination = destination_dir / input_path.name - - if input_path.is_dir(): - shutil.copytree(input_location, destination) - else: - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - shutil.copyfile(input_location, destination) - - return destination - - -def copy_inputs(input_locations, dest_path): - """Copy the provided ``input_locations`` to the ``dest_path``.""" - for input_location in input_locations: - copy_input(input_location, dest_path) - - -def move_input(input_location, dest_path): - """Move the provided ``input_location`` to the ``dest_path``.""" - destination = dest_path / Path(input_location).name - return shutil.move(input_location, destination) - - -def move_inputs(inputs, dest_path): - """Move the provided ``inputs`` to the ``dest_path``.""" - for input_location in inputs: - move_input(input_location, dest_path) - - -def get_tool_name_from_scan_headers(scan_data): - """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - tool_name = first_header.get("tool_name", "") - return tool_name - - -def get_extra_data_from_scan_headers(scan_data): - """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - if extra_data := first_header.get("extra_data"): - return extra_data - - -def is_archive(location): - """Return True if the file at ``location`` is an archive.""" - return get_type(location).is_archive - - -def 
load_inventory_from_toolkit_scan(project, input_location): - """ - Create license detections, packages, dependencies, and resources - loaded from the ScanCode-toolkit scan results located at ``input_location``. - """ - scanned_codebase = scancode.get_virtual_codebase(project, input_location) - scancode.create_discovered_licenses(project, scanned_codebase) - scancode.create_discovered_packages(project, scanned_codebase) - scancode.create_codebase_resources(project, scanned_codebase) - scancode.create_discovered_dependencies( - project, scanned_codebase, strip_datafile_path_root=True - ) - scancode.load_todo_issues(project, scanned_codebase) - - -def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): - """ - Create packages, dependencies, license detections, resources, and relations - loaded from a ScanCode.io JSON output provided as ``scan_data``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - for detection_data in scan_data.get("license_detections", []): - pipes.update_or_create_license_detection(project, detection_data) - - for package_data in scan_data.get("packages", []): - pipes.update_or_create_package(project, package_data) - - for resource_data in scan_data.get("files", []): - pipes.update_or_create_resource(project, resource_data) - - for dependency_data in scan_data.get("dependencies", []): - pipes.update_or_create_dependency(project, dependency_data) - - for relation_data in scan_data.get("relations", []): - pipes.get_or_create_relation(project, relation_data) - - if extra_data := get_extra_data_from_scan_headers(scan_data): - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -model_to_object_maker_func = { - DiscoveredPackage: pipes.update_or_create_package, - DiscoveredDependency: pipes.update_or_create_dependency, - DiscoveredLicense: pipes.update_or_create_license_detection, - CodebaseResource: pipes.update_or_create_resource, - CodebaseRelation: pipes.get_or_create_relation, -} - -worksheet_name_to_model = { - "PACKAGES": DiscoveredPackage, - "LICENSE_DETECTIONS": DiscoveredLicense, - "RESOURCES": CodebaseResource, - "DEPENDENCIES": DiscoveredDependency, - "RELATIONS": CodebaseRelation, -} - - -def get_worksheet_data(worksheet): - """Return the data from provided ``worksheet`` as a list of dict.""" - try: - header = [cell.value for cell in next(worksheet.rows)] - except StopIteration: - return {} - - worksheet_data = [ - dict(zip(header, row)) - for row in worksheet.iter_rows(min_row=2, values_only=True) - ] - return worksheet_data - - -def clean_xlsx_field_value(model_class, field_name, value): - """Clean the ``value`` for compatibility with the database ``model_class``.""" - if value in EMPTY_VALUES: - return - - if field_name == "for_packages": - return value.splitlines() - - elif field_name in ["purl", "for_package_uid", "datafile_path"]: - return value - - try: - field = model_class._meta.get_field(field_name) - except FieldDoesNotExist: - return - - if dict_key := mappings_key_by_fieldname.get(field_name): - return [{dict_key: entry} for entry in value.splitlines()] - - elif isinstance(field, models.JSONField): - if field.default is list: - return value.splitlines() - elif field.default is dict: - return # dict stored as JSON are not supported - - return value - - -def clean_xlsx_data_to_model_data(model_class, xlsx_data): - """Clean the ``xlsx_data`` for 
compatibility with the database ``model_class``.""" - cleaned_data = {} - - for field_name, value in xlsx_data.items(): - if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): - cleaned_data[field_name] = cleaned_value - - return cleaned_data - - -def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): - """ - Create packages, dependencies, resources, and relations loaded from XLSX file - located at ``input_location``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) - - for worksheet_name, model_class in worksheet_name_to_model.items(): - if worksheet_name not in workbook: - continue - - worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) - for row_data in worksheet_data: - object_maker_func = model_to_object_maker_func.get(model_class) - cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) - if cleaned_data: - object_maker_func(project, cleaned_data) - - if "LAYERS" in workbook: - layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) - extra_data = {"layers": layers_data} - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -def add_input_from_url(project, url, filename=None): - """ - Download the file from the provided ``url`` and add it as an InputSource for the - specified ``project``. Optionally, specify a ``filename`` for the downloaded file. - If archiving is enabled, store the content in the DownloadStore and save metadata. - """ - try: - response = requests.get(url, stream=True,timeout=30) - response.raise_for_status() - content = response.content - except requests.RequestException as e: - logger.error(f"Failed to download {url}: {e}") - raise - - filename = filename or url.split("/")[-1] or "downloaded_file" - url_hash = hashlib.sha256(url.encode()).hexdigest() - archive_path = Path(project.settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if download_store: - try: - download = download_store.put( - content=content, - download_url=url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to archive download for {url}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - download_url=url, - file_path=str(input_path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise - -def add_input_from_upload(project, uploaded_file): - """ - Add an uploaded file as an InputSource for the specified ``project``. - If archiving is enabled, store the content in the DownloadStore and save metadata. 
- """ - content = uploaded_file.read() - filename = uploaded_file.name - - if download_store: - try: - download = download_store.put( - content=content, - download_url="", - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to archive upload {filename}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - file_path=str(input_path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
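+
+# NOTE: when a DownloadStore is configured in scancodeio.settings,
+# ``add_input_from_url`` and ``add_input_from_upload`` archive the content
+# through it, recording the sha256, download date, and storage path on the
+# InputSource; otherwise the file is written under the project input/ directory.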
+ +import logging +import os +import shutil +from datetime import datetime +from pathlib import Path + +from django.core.exceptions import FieldDoesNotExist +from django.core.validators import EMPTY_VALUES +from django.db import models + +import openpyxl +import requests +from typecode.contenttype import get_type + +from scancodeio.settings import download_store +from scanpipe import pipes +from scanpipe.models import CodebaseRelation +from scanpipe.models import CodebaseResource +from scanpipe.models import DiscoveredDependency +from scanpipe.models import DiscoveredLicense +from scanpipe.models import DiscoveredPackage +from scanpipe.models import InputSource +from scanpipe.pipes import scancode +from scanpipe.pipes.output import mappings_key_by_fieldname + +logger = logging.getLogger(__name__) + + +def copy_input(input_location, dest_path): + """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" + input_path = Path(input_location) + destination_dir = Path(dest_path) + destination = destination_dir / input_path.name + + if input_path.is_dir(): + shutil.copytree(input_location, destination) + else: + if not os.path.exists(destination_dir): + os.makedirs(destination_dir) + shutil.copyfile(input_location, destination) + + return destination + + +def copy_inputs(input_locations, dest_path): + """Copy the provided ``input_locations`` to the ``dest_path``.""" + for input_location in input_locations: + copy_input(input_location, dest_path) + + +def move_input(input_location, dest_path): + """Move the provided ``input_location`` to the ``dest_path``.""" + destination = dest_path / Path(input_location).name + return shutil.move(input_location, destination) + + +def move_inputs(inputs, dest_path): + """Move the provided ``inputs`` to the ``dest_path``.""" + for input_location in inputs: + move_input(input_location, dest_path) + + +def get_tool_name_from_scan_headers(scan_data): + """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" + if headers := scan_data.get("headers", []): + first_header = headers[0] + tool_name = first_header.get("tool_name", "") + return tool_name + + +def get_extra_data_from_scan_headers(scan_data): + """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" + if headers := scan_data.get("headers", []): + first_header = headers[0] + if extra_data := first_header.get("extra_data"): + return extra_data + + +def is_archive(location): + """Return True if the file at ``location`` is an archive.""" + return get_type(location).is_archive + + +def load_inventory_from_toolkit_scan(project, input_location): + """ + Create license detections, packages, dependencies, and resources + loaded from the ScanCode-toolkit scan results located at ``input_location``. + """ + scanned_codebase = scancode.get_virtual_codebase(project, input_location) + scancode.create_discovered_licenses(project, scanned_codebase) + scancode.create_discovered_packages(project, scanned_codebase) + scancode.create_codebase_resources(project, scanned_codebase) + scancode.create_discovered_dependencies( + project, scanned_codebase, strip_datafile_path_root=True + ) + scancode.load_todo_issues(project, scanned_codebase) + + +def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): + """ + Create packages, dependencies, license detections, resources, and relations + loaded from a ScanCode.io JSON output provided as ``scan_data``. 
+
+    An ``extra_data_prefix`` can be provided in case multiple input files are loaded
+    into the same project. The prefix is usually the filename of the input.
+    """
+    for detection_data in scan_data.get("license_detections", []):
+        pipes.update_or_create_license_detection(project, detection_data)
+
+    for package_data in scan_data.get("packages", []):
+        pipes.update_or_create_package(project, package_data)
+
+    for resource_data in scan_data.get("files", []):
+        pipes.update_or_create_resource(project, resource_data)
+
+    for dependency_data in scan_data.get("dependencies", []):
+        pipes.update_or_create_dependency(project, dependency_data)
+
+    for relation_data in scan_data.get("relations", []):
+        pipes.get_or_create_relation(project, relation_data)
+
+    if extra_data := get_extra_data_from_scan_headers(scan_data):
+        if extra_data_prefix:
+            extra_data = {extra_data_prefix: extra_data}
+        project.update_extra_data(extra_data)
+
+
+model_to_object_maker_func = {
+    DiscoveredPackage: pipes.update_or_create_package,
+    DiscoveredDependency: pipes.update_or_create_dependency,
+    DiscoveredLicense: pipes.update_or_create_license_detection,
+    CodebaseResource: pipes.update_or_create_resource,
+    CodebaseRelation: pipes.get_or_create_relation,
+}
+
+worksheet_name_to_model = {
+    "PACKAGES": DiscoveredPackage,
+    "LICENSE_DETECTIONS": DiscoveredLicense,
+    "RESOURCES": CodebaseResource,
+    "DEPENDENCIES": DiscoveredDependency,
+    "RELATIONS": CodebaseRelation,
+}
+
+
+def get_worksheet_data(worksheet):
+    """Return the data from the provided ``worksheet`` as a list of dict."""
+    try:
+        header = [cell.value for cell in next(worksheet.rows)]
+    except StopIteration:
+        return []
+
+    worksheet_data = [
+        dict(zip(header, row))
+        for row in worksheet.iter_rows(min_row=2, values_only=True)
+    ]
+    return worksheet_data
+
+
+def clean_xlsx_field_value(model_class, field_name, value):
+    """Clean the ``value`` for compatibility with the database ``model_class``."""
+    if value in EMPTY_VALUES:
+        return
+
+    if field_name == "for_packages":
+        return value.splitlines()
+
+    elif field_name in ["purl", "for_package_uid", "datafile_path"]:
+        return value
+
+    try:
+        field = model_class._meta.get_field(field_name)
+    except FieldDoesNotExist:
+        return
+
+    if dict_key := mappings_key_by_fieldname.get(field_name):
+        return [{dict_key: entry} for entry in value.splitlines()]
+
+    elif isinstance(field, models.JSONField):
+        if field.default is list:
+            return value.splitlines()
+        elif field.default is dict:
+            return  # dict stored as JSON is not supported
+
+    return value
+
+
+def clean_xlsx_data_to_model_data(model_class, xlsx_data):
+    """Clean the ``xlsx_data`` for compatibility with the database ``model_class``."""
+    cleaned_data = {}
+
+    for field_name, value in xlsx_data.items():
+        if cleaned_value := clean_xlsx_field_value(model_class, field_name, value):
+            cleaned_data[field_name] = cleaned_value
+
+    return cleaned_data
+
+
+def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None):
+    """
+    Create packages, dependencies, resources, and relations loaded from the XLSX
+    file located at ``input_location``.
+
+    An ``extra_data_prefix`` can be provided in case multiple input files are loaded
+    into the same project. The prefix is usually the filename of the input.
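+
+    A minimal usage sketch (illustrative; assumes the file was produced by the
+    ScanCode.io XLSX output and ``project`` is an existing Project; the
+    ``results.xlsx`` filename is hypothetical):
+
+        load_inventory_from_xlsx(
+            project, "results.xlsx", extra_data_prefix="results.xlsx"
+        )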
+ """ + workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) + + for worksheet_name, model_class in worksheet_name_to_model.items(): + if worksheet_name not in workbook: + continue + + worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) + for row_data in worksheet_data: + object_maker_func = model_to_object_maker_func.get(model_class) + cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) + if cleaned_data: + object_maker_func(project, cleaned_data) + + if "LAYERS" in workbook: + layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) + extra_data = {"layers": layers_data} + if extra_data_prefix: + extra_data = {extra_data_prefix: extra_data} + project.update_extra_data(extra_data) + + +def add_input_from_url(project, url, filename=None): + """ + Download the file from the provided ``url`` and add it as an InputSource for the + specified ``project``. Optionally, specify a ``filename`` for the downloaded file. + If archiving is enabled, store the content in the DownloadStore and save metadata. + """ + try: + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + content = response.content + except requests.RequestException as e: + logger.error(f"Failed to download {url}: {e}") + raise + + filename = filename or url.split("/")[-1] or "downloaded_file" + + if download_store: + try: + download = download_store.put( + content=content, + download_url=url, + download_date=datetime.now().isoformat(), + filename=filename, + ) + InputSource.objects.create( + project=project, + sha256=download.sha256, + download_url=download.download_url, + filename=download.filename, + download_date=download.download_date, + file_path=str(download.path), + is_uploaded=False, + ) + except Exception as e: + logger.error(f"Failed to archive download for {url}: {e}") + raise + else: + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + download_url=url, + file_path=str(input_path), + is_uploaded=False, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise + + +def add_input_from_upload(project, uploaded_file): + """ + Add an uploaded file as an InputSource for the specified ``project``. + If archiving is enabled, store the content in the DownloadStore and save metadata. 
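+
+    A minimal usage sketch (illustrative; ``uploaded_file`` can be any Django
+    ``UploadedFile``, such as an entry from ``request.FILES`` or a
+    ``SimpleUploadedFile`` in tests):
+
+        from django.core.files.uploadedfile import SimpleUploadedFile
+
+        upload = SimpleUploadedFile("sample.tar.gz", b"archive bytes")
+        add_input_from_upload(project, upload)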
+ """ + content = uploaded_file.read() + filename = uploaded_file.name + + if download_store: + try: + download = download_store.put( + content=content, + download_url="", + download_date=datetime.now().isoformat(), + filename=filename, + ) + InputSource.objects.create( + project=project, + sha256=download.sha256, + download_url=download.download_url, + filename=download.filename, + download_date=download.download_date, + file_path=str(download.path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to archive upload {filename}: {e}") + raise + else: + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + file_path=str(input_path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise diff --git a/scanpipe/tests/test_archiving.py b/scanpipe/tests/test_archiving.py index a249c96c46..0da1a236b5 100644 --- a/scanpipe/tests/test_archiving.py +++ b/scanpipe/tests/test_archiving.py @@ -1,86 +1,86 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
- - -import hashlib -from pathlib import Path - -from django.test import TestCase - -from scanpipe.archiving import LocalFilesystemProvider -from scanpipe.tests import make_project - - -class TestArchiving(TestCase): - def setUp(self): - self.project = make_project() - self.root_path = Path(__file__).parent / "data" / "test_downloads" - self.store = LocalFilesystemProvider(root_path=self.root_path) - self.test_content = b"test content" - self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - self.test_filename = "sample.tar.gz" - - def tearDown(self): - if self.root_path.exists(): - import shutil - - shutil.rmtree(self.root_path) - - def test_local_filesystem_provider_put_get(self): - download = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - sha256 = hashlib.sha256(self.test_content).hexdigest() - self.assertEqual(download.sha256, sha256) - self.assertEqual(download.download_url, self.test_url) - self.assertEqual(download.filename, self.test_filename) - self.assertEqual(download.download_date, "2025-08-21T09:00:00") - content_path = ( - self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" - ) - self.assertTrue(content_path.exists()) - with open(content_path, "rb") as f: - self.assertEqual(f.read(), self.test_content) - - retrieved = self.store.get(sha256) - self.assertEqual(retrieved.sha256, sha256) - self.assertEqual(retrieved.download_url, self.test_url) - self.assertEqual(retrieved.filename, self.test_filename) - - def test_local_filesystem_provider_deduplication(self): - download1 = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - download2 = self.store.put( - content=self.test_content, - download_url="https://files.pythonhosted.org/packages/another.tar.gz", - download_date="2025-08-21T10:00:00", - filename="another.tar.gz", - ) - self.assertEqual(download1.sha256, download2.sha256) - self.assertEqual(download1.download_url, self.test_url) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+ + +import hashlib +from pathlib import Path + +from django.test import TestCase + +from scanpipe.archiving import LocalFilesystemProvider +from scanpipe.tests import make_project + + +class TestArchiving(TestCase): + def setUp(self): + self.project = make_project() + self.root_path = Path(__file__).parent / "data" / "test_downloads" + self.store = LocalFilesystemProvider(root_path=self.root_path) + self.test_content = b"test content" + self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" + self.test_filename = "sample.tar.gz" + + def tearDown(self): + if self.root_path.exists(): + import shutil + + shutil.rmtree(self.root_path) + + def test_local_filesystem_provider_put_get(self): + download = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + sha256 = hashlib.sha256(self.test_content).hexdigest() + self.assertEqual(download.sha256, sha256) + self.assertEqual(download.download_url, self.test_url) + self.assertEqual(download.filename, self.test_filename) + self.assertEqual(download.download_date, "2025-08-21T09:00:00") + content_path = ( + self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" + ) + self.assertTrue(content_path.exists()) + with open(content_path, "rb") as f: + self.assertEqual(f.read(), self.test_content) + + retrieved = self.store.get(sha256) + self.assertEqual(retrieved.sha256, sha256) + self.assertEqual(retrieved.download_url, self.test_url) + self.assertEqual(retrieved.filename, self.test_filename) + + def test_local_filesystem_provider_deduplication(self): + download1 = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + download2 = self.store.put( + content=self.test_content, + download_url="https://files.pythonhosted.org/packages/another.tar.gz", + download_date="2025-08-21T10:00:00", + filename="another.tar.gz", + ) + self.assertEqual(download1.sha256, download2.sha256) + self.assertEqual(download1.download_url, self.test_url) diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index 3f2848cf1b..e55a90cace 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -1,143 +1,112 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: -# http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, -# software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an -# "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. 
-# Visit https://github.com/aboutcode-org/scancode.io for support and download. - - -from pathlib import Path -from unittest.mock import patch - -from django.core.files.uploadedfile import SimpleUploadedFile -from django.test import TestCase - -from scanpipe.models import InputSource -from scanpipe.pipes.input import add_input_from_upload -from scanpipe.pipes.input import add_input_from_url -from scancodeio.settings import settings -from scanpipe.tests import make_project - - -class TestInput(TestCase): - def setUp(self): - self.project = make_project() - self.test_filename = "sample.tar.gz" - self.test_data_path = ( - Path(__file__).parent / - "data" / - "test-downloads" / - self.test_filename - ) - with open(self.test_data_path, "rb") as f: - self.test_content = f.read() - - @patch("requests.get") - def test_add_input_from_url(self, mock_get): - test_url = ( - "https://files.pythonhosted.org/" - "packages/sample.tar.gz" - ) - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url( - self.project, - test_url, - filename=self.test_filename - ) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith( - settings.CENTRAL_ARCHIVE_PATH - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - @patch("requests.get") - def test_add_input_from_url_fallback(self, mock_get): - test_url = ( - "https://files.pythonhosted.org/" - "packages/sample.tar.gz" - ) - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url( - self.project, - test_url, - filename=self.test_filename - ) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith( - str(self.project.input_path) - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - def test_add_input_from_upload(self): - uploaded_file = SimpleUploadedFile( - self.test_filename, - self.test_content - ) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertTrue(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith( - settings.CENTRAL_ARCHIVE_PATH - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - def test_add_input_from_upload_fallback(self): - uploaded_file = SimpleUploadedFile( - self.test_filename, - self.test_content - ) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertFalse(input_source.sha256) - 
self.assertFalse(input_source.download_date)
-        self.assertTrue(input_source.is_uploaded)
-        self.assertTrue(
-            str(input_source.file_path).startswith(
-                str(self.project.input_path)
-            )
-        )
-        self.assertTrue(Path(input_source.file_path).exists())
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at:
+# http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing,
+# software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an
+# "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+
+from pathlib import Path
+from unittest.mock import patch
+
+from django.conf import settings
+from django.core.files.uploadedfile import SimpleUploadedFile
+from django.test import TestCase
+
+from scanpipe.models import InputSource
+from scanpipe.pipes.input import add_input_from_upload
+from scanpipe.pipes.input import add_input_from_url
+from scanpipe.tests import make_project
+
+
+class TestInput(TestCase):
+    def setUp(self):
+        self.project = make_project()
+        self.test_filename = "sample.tar.gz"
+        self.test_data_path = (
+            Path(__file__).parent / "data" / "test-downloads" / self.test_filename
+        )
+        with open(self.test_data_path, "rb") as f:
+            self.test_content = f.read()
+
+    @patch("requests.get")
+    def test_add_input_from_url(self, mock_get):
+        test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        mock_get.return_value.content = self.test_content
+        mock_get.return_value.status_code = 200
+        add_input_from_url(self.project, test_url, filename=self.test_filename)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertFalse(input_source.is_uploaded)
+        self.assertTrue(
+            input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
+
+    @patch("scanpipe.pipes.input.download_store", None)
+    @patch("requests.get")
+    def test_add_input_from_url_fallback(self, mock_get):
+        test_url = "https://files.pythonhosted.org/packages/sample.tar.gz"
+        mock_get.return_value.content = self.test_content
+        mock_get.return_value.status_code = 200
+        add_input_from_url(self.project, test_url, filename=self.test_filename)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, test_url)
+        self.assertFalse(input_source.sha256)
+        
self.assertFalse(input_source.download_date) + self.assertFalse(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith(str(self.project.input_path)) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + def test_add_input_from_upload(self): + uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertTrue(input_source.sha256) + self.assertTrue(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + @patch("scanpipe.pipes.input.download_store", None) + def test_add_input_from_upload_fallback(self): + uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertFalse(input_source.sha256) + self.assertFalse(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith(str(self.project.input_path)) + ) + self.assertTrue(Path(input_source.file_path).exists()) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 6439e842dd..16c6260ebc 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1,4 +1,5 @@ <<<<<<< HEAD +<<<<<<< HEAD # SPDX-License-Identifier: Apache-2.0 # From a381d69ff7bb63221f908173992c256f60f941a7 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 07:57:23 +0530 Subject: [PATCH 11/18] Revert "Revert "Revert "add tests for storing packages""" This reverts commit b6d2342873168e53865e8f39185a9602de191b7f. --- Dockerfile | 97 +++ scancodeio/settings.py | 979 ++++++++++++++++--------------- scanpipe/archiving.py | 375 ++++++------ scanpipe/pipelines/__init__.py | 699 +++++++++++----------- scanpipe/pipes/input.py | 692 +++++++++++----------- scanpipe/tests/test_archiving.py | 172 +++--- scanpipe/tests/test_input.py | 255 ++++---- scanpipe/tests/test_pipelines.py | 27 +- 8 files changed, 1726 insertions(+), 1570 deletions(-) diff --git a/Dockerfile b/Dockerfile index eae3f12edb..2527dea2f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,5 @@ <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> b6d23428 (Revert "Revert "add tests for storing packages"") @@ -100,3 +101,99 @@ COPY --chown=$APP_USER:$APP_USER . $APP_DIR ======= COPY --chown=$APP_USER:$APP_USER . $APP_DIR >>>>>>> b6d23428 (Revert "Revert "add tests for storing packages"") +======= +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. 
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +FROM python:3.13-slim + +LABEL org.opencontainers.image.source="https://github.com/aboutcode-org/scancode.io" +LABEL org.opencontainers.image.description="ScanCode.io" +LABEL org.opencontainers.image.licenses="Apache-2.0" + +ENV APP_NAME scancodeio +ENV APP_USER app +ENV APP_DIR /opt/$APP_NAME +ENV VENV_LOCATION /opt/$APP_NAME/.venv + +# Force Python unbuffered stdout and stderr (they are flushed to terminal immediately) +ENV PYTHONUNBUFFERED 1 +# Do not write Python .pyc files +ENV PYTHONDONTWRITEBYTECODE 1 +# Add the app dir in the Python path for entry points availability +ENV PYTHONPATH $PYTHONPATH:$APP_DIR + +# OS requirements as per +# https://scancode-toolkit.readthedocs.io/en/latest/getting-started/install.html +# Also install universal-ctags and xgettext for symbol and string collection. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bzip2 \ + xz-utils \ + zlib1g \ + libxml2-dev \ + libxslt1-dev \ + libgomp1 \ + libsqlite3-0 \ + libgcrypt20 \ + libpopt0 \ + libzstd1 \ + libgpgme11 \ + libdevmapper1.02.1 \ + libguestfs-tools \ + linux-image-amd64 \ + git \ + wait-for-it \ + universal-ctags \ + gettext \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Create the APP_USER group and user +RUN addgroup --system $APP_USER \ + && adduser --system --group --home=$APP_DIR $APP_USER \ + && chown $APP_USER:$APP_USER $APP_DIR + +# Create the /var/APP_NAME directory with proper permission for APP_USER +RUN mkdir -p /var/$APP_NAME \ + && chown $APP_USER:$APP_USER /var/$APP_NAME + +# Setup the work directory and the user as APP_USER for the remaining stages +WORKDIR $APP_DIR +USER $APP_USER + +# Create the virtualenv +RUN python -m venv $VENV_LOCATION +# Enable the virtualenv, similar effect as "source activate" +ENV PATH $VENV_LOCATION/bin:$PATH + +# Create static/ and workspace/ directories +RUN mkdir -p /var/$APP_NAME/static/ \ + && mkdir -p /var/$APP_NAME/workspace/ + +# Install the dependencies before the codebase COPY for proper Docker layer caching +COPY --chown=$APP_USER:$APP_USER pyproject.toml $APP_DIR/ +RUN pip install --no-cache-dir . + +# Copy the codebase and set the proper permissions for the APP_USER +COPY --chown=$APP_USER:$APP_USER . 
$APP_DIR +>>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") diff --git a/scancodeio/settings.py b/scancodeio/settings.py index 15e52a4440..2d7686900c 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -1,488 +1,491 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import logging -import sys -import tempfile -from pathlib import Path - -import environ - -from scanpipe.archiving import LocalFilesystemProvider - -PROJECT_DIR = environ.Path(__file__) - 1 -ROOT_DIR = PROJECT_DIR - 1 - -# True if running tests through `./manage test` -IS_TESTS = "test" in sys.argv - -# Environment - -ENV_FILE = "/etc/scancodeio/.env" -if not Path(ENV_FILE).exists(): - ENV_FILE = ROOT_DIR(".env") - -# Do not use local .env environment when running the tests. -if IS_TESTS: - ENV_FILE = None - -env = environ.Env() -environ.Env.read_env(ENV_FILE) - -# Security - -SECRET_KEY = env.str("SECRET_KEY", default="") - -ALLOWED_HOSTS = env.list( - "ALLOWED_HOSTS", - default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], -) - -CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) - -# SECURITY WARNING: don't run with debug turned on in production -DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) - -SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( - "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False -) - -SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) - -SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) - -X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") - -SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) - -CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) - -# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT -# are handled by the web server. 
-SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] - -# ScanCode.io - -SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") - -SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") - -SCANCODEIO_CONFIG_FILE = env.str( - "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" -) - -SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") - -# Set the number of parallel processes to use for ScanCode related scan execution. -# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs -# available on the machine. -SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) - -SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") - -# This setting defines the additional locations ScanCode.io will search for pipelines. -# This should be set to a list of strings that contain full paths to your additional -# pipelines directories. -SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) - -# Maximum time allowed for a pipeline to complete. -SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") - -# Default to 2 minutes. -SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) - -# Default to None which scans all files -SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) - -# List views pagination, controls the number of items displayed per page. -# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 -SCANCODEIO_PAGINATE_BY = env.dict( - "SCANCODEIO_PAGINATE_BY", - default={ - "project": 20, - "error": 50, - "resource": 100, - "package": 100, - "dependency": 100, - "license": 100, - "relation": 100, - }, -) - -# Default limit for "most common" entries in QuerySets. -SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) - -# The base URL (e.g., https://hostname/) of this application instance. -# Required for generating URLs to reference objects within the app, -# such as in webhook notifications. 
-SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") - -# Fetch authentication credentials - -# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" -SCANCODEIO_FETCH_BASIC_AUTH = env.dict( - "SCANCODEIO_FETCH_BASIC_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" -SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( - "SCANCODEIO_FETCH_DIGEST_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" -SCANCODEIO_FETCH_HEADERS = {} -FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") -for entry in FETCH_HEADERS_STR.split(";"): - if entry.strip(): - host, headers = entry.split("=", 1) - SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) - -# SCANCODEIO_NETRC_LOCATION="~/.netrc" -SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") -if SCANCODEIO_NETRC_LOCATION: - # Propagate the location to the environ for `requests.utils.get_netrc_auth` - env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION - -# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" -SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) - -# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" -SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( - "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" -) - -# This webhook will be added as WebhookSubscription for each new project. -# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False -SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) - -# Application definition - -INSTALLED_APPS = [ - # Local apps - # Must come before Third-party apps for proper templates override - "scanpipe", - # Django built-in - "django.contrib.auth", - "django.contrib.contenttypes", - "django.contrib.sessions", - "django.contrib.messages", - "django.contrib.staticfiles", - "django.contrib.admin", - "django.contrib.humanize", - # Third-party apps - "crispy_forms", - "crispy_bootstrap3", # required for the djangorestframework browsable API - "django_filters", - "rest_framework", - "rest_framework.authtoken", - "django_rq", - "django_probes", - "taggit", -] - -MIDDLEWARE = [ - "django.middleware.security.SecurityMiddleware", - "django.contrib.sessions.middleware.SessionMiddleware", - "django.middleware.common.CommonMiddleware", - "django.middleware.csrf.CsrfViewMiddleware", - "django.contrib.auth.middleware.AuthenticationMiddleware", - "django.contrib.messages.middleware.MessageMiddleware", - "django.middleware.clickjacking.XFrameOptionsMiddleware", - "scancodeio.middleware.TimezoneMiddleware", -] - -ROOT_URLCONF = "scancodeio.urls" - -WSGI_APPLICATION = "scancodeio.wsgi.application" - -SECURE_PROXY_SSL_HEADER = env.tuple( - "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") -) - -# Database - -DATABASES = { - "default": { - "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), - "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), - "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), - "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), - "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), - "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), - "ATOMIC_REQUESTS": True, - } -} - -DEFAULT_AUTO_FIELD = "django.db.models.AutoField" - -# Forms and filters - -FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") - -# Templates - 
-TEMPLATES = [ - { - "BACKEND": "django.template.backends.django.DjangoTemplates", - "APP_DIRS": True, - "OPTIONS": { - "debug": DEBUG, - "context_processors": [ - "django.contrib.auth.context_processors.auth", - "django.contrib.messages.context_processors.messages", - "django.template.context_processors.request", - "scancodeio.context_processors.versions", - ], - }, - }, -] - -# Login - -LOGIN_REDIRECT_URL = "project_list" - -# Passwords - -AUTH_PASSWORD_VALIDATORS = [ - { - "NAME": ( - "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" - ), - }, - { - "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", - "OPTIONS": { - "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), - }, - }, - { - "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", - }, -] - -# Testing - -if IS_TESTS: - from django.core.management.utils import get_random_secret_key - - SECRET_KEY = get_random_secret_key() - # Do not pollute the workspace while running the tests. - SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() - SCANCODEIO_REQUIRE_AUTHENTICATION = True - SCANCODEIO_SCAN_FILE_TIMEOUT = 120 - SCANCODEIO_POLICIES_FILE = None - # The default password hasher is rather slow by design. - # Using a faster hashing algorithm in the testing context to speed up the run. - PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] - -# Debug toolbar - -DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) -if DEBUG and DEBUG_TOOLBAR: - INSTALLED_APPS.append("debug_toolbar") - MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") - INTERNAL_IPS = ["127.0.0.1"] - -# Logging - -LOGGING = { - "version": 1, - "disable_existing_loggers": False, - "formatters": { - "simple": { - "format": "{levelname} {message}", - "style": "{", - }, - }, - "handlers": { - "null": { - "class": "logging.NullHandler", - }, - "console": { - "class": "logging.StreamHandler", - "formatter": "simple", - }, - }, - "loggers": { - "scanpipe": { - "handlers": ["null"] if IS_TESTS else ["console"], - "level": SCANCODEIO_LOG_LEVEL, - "propagate": False, - }, - "django": { - "handlers": ["null"] if IS_TESTS else ["console"], - "propagate": False, - }, - # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. - "django.db.backends": { - "level": SCANCODEIO_LOG_LEVEL, - }, - }, -} - -# Instead of sending out real emails the console backend just writes the emails -# that would be sent to the standard output. 
-EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" - -# Internationalization - -LANGUAGE_CODE = "en-us" - -FORMAT_MODULE_PATH = ["scancodeio.formats"] - -TIME_ZONE = env.str("TIME_ZONE", default="UTC") - -USE_I18N = True - -USE_TZ = True - -# Static files (CSS, JavaScript, Images) - -STATIC_URL = "/static/" - -STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") - -STATICFILES_DIRS = [ - PROJECT_DIR("static"), -] - -# Third-party apps - -CRISPY_TEMPLATE_PACK = "bootstrap3" - -# Centralized archive directory for all projects -CENTRAL_ARCHIVE_PATH = env.str( - "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" -) - -# localstorage configuration -DOWNLOAD_ARCHIVING_PROVIDER = env.str( - "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" -) - -# For local storage, we would store the root path in that setting -DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( - "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None -) - -# Initialize the DownloadStore for local storage - -download_store = None -logger = logging.getLogger(__name__) -if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": - config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} - root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) - try: - download_store = LocalFilesystemProvider(root_path=root_path) - except Exception as e: - logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") -else: - logger.error(f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}") - -# Job Queue - -RQ_QUEUES = { - "default": { - "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), - "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), - "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), - "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), - "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), - "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), - # Enable SSL for Redis connections when deploying ScanCode.io in environments - # where Redis is hosted on a separate system (e.g., cloud deployment or remote - # Redis server) to secure data in transit. 
- "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), - }, -} - -SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) -if not SCANCODEIO_ASYNC: - for queue_config in RQ_QUEUES.values(): - queue_config["ASYNC"] = False - -# ClamAV virus scan -CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) -CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") - -# Django restframework - -REST_FRAMEWORK = { - "DEFAULT_AUTHENTICATION_CLASSES": ( - "rest_framework.authentication.TokenAuthentication", - ), - "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), - "DEFAULT_RENDERER_CLASSES": ( - "rest_framework.renderers.JSONRenderer", - "rest_framework.renderers.BrowsableAPIRenderer", - "rest_framework.renderers.AdminRenderer", - ), - "DEFAULT_FILTER_BACKENDS": ( - "django_filters.rest_framework.DjangoFilterBackend", - "rest_framework.filters.SearchFilter", - ), - "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", - "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), - "UPLOADED_FILES_USE_URL": False, -} - -if not SCANCODEIO_REQUIRE_AUTHENTICATION: - REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( - "rest_framework.permissions.AllowAny", - ) - -# VulnerableCode integration - -VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") -VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") -VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") -VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") - -# PurlDB integration - -PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") -PURLDB_USER = env.str("PURLDB_USER", default="") -PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") -PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") - -# MatchCode.io integration - -MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") -MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") -MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") -MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") - -# FederatedCode integration - -FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( - "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" -).rstrip("/") -FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") -FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") -FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. 
Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import sys +import tempfile +from pathlib import Path +import logging + +import environ + +from scanpipe.archiving import LocalFilesystemProvider + + +PROJECT_DIR = environ.Path(__file__) - 1 +ROOT_DIR = PROJECT_DIR - 1 + +# True if running tests through `./manage test` +IS_TESTS = "test" in sys.argv + +# Environment + +ENV_FILE = "/etc/scancodeio/.env" +if not Path(ENV_FILE).exists(): + ENV_FILE = ROOT_DIR(".env") + +# Do not use local .env environment when running the tests. +if IS_TESTS: + ENV_FILE = None + +env = environ.Env() +environ.Env.read_env(ENV_FILE) + +# Security + +SECRET_KEY = env.str("SECRET_KEY", default="") + +ALLOWED_HOSTS = env.list( + "ALLOWED_HOSTS", + default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], +) + +CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) + +# SECURITY WARNING: don't run with debug turned on in production +DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) + +SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( + "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False +) + +SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) + +SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) + +X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") + +SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) + +CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) + +# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT +# are handled by the web server. +SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] + +# ScanCode.io + +SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") + +SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") + +SCANCODEIO_CONFIG_FILE = env.str( + "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" +) + +SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") + +# Set the number of parallel processes to use for ScanCode related scan execution. +# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs +# available on the machine. +SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) + +SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") + +# This setting defines the additional locations ScanCode.io will search for pipelines. +# This should be set to a list of strings that contain full paths to your additional +# pipelines directories. +SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) + +# Maximum time allowed for a pipeline to complete. +SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") + +# Default to 2 minutes. +SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) + +# Default to None which scans all files +SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) + +# List views pagination, controls the number of items displayed per page. 
+# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 +SCANCODEIO_PAGINATE_BY = env.dict( + "SCANCODEIO_PAGINATE_BY", + default={ + "project": 20, + "error": 50, + "resource": 100, + "package": 100, + "dependency": 100, + "license": 100, + "relation": 100, + }, +) + +# Default limit for "most common" entries in QuerySets. +SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) + +# The base URL (e.g., https://hostname/) of this application instance. +# Required for generating URLs to reference objects within the app, +# such as in webhook notifications. +SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") + +# Fetch authentication credentials + +# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" +SCANCODEIO_FETCH_BASIC_AUTH = env.dict( + "SCANCODEIO_FETCH_BASIC_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" +SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( + "SCANCODEIO_FETCH_DIGEST_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" +SCANCODEIO_FETCH_HEADERS = {} +FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") +for entry in FETCH_HEADERS_STR.split(";"): + if entry.strip(): + host, headers = entry.split("=", 1) + SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) + +# SCANCODEIO_NETRC_LOCATION="~/.netrc" +SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") +if SCANCODEIO_NETRC_LOCATION: + # Propagate the location to the environ for `requests.utils.get_netrc_auth` + env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION + +# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" +SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) + +# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" +SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( + "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" +) + +# This webhook will be added as WebhookSubscription for each new project. 
+# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False +SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) + +# Application definition + +INSTALLED_APPS = [ + # Local apps + # Must come before Third-party apps for proper templates override + "scanpipe", + # Django built-in + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", + "django.contrib.humanize", + # Third-party apps + "crispy_forms", + "crispy_bootstrap3", # required for the djangorestframework browsable API + "django_filters", + "rest_framework", + "rest_framework.authtoken", + "django_rq", + "django_probes", + "taggit", +] + +MIDDLEWARE = [ + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", + "scancodeio.middleware.TimezoneMiddleware", +] + +ROOT_URLCONF = "scancodeio.urls" + +WSGI_APPLICATION = "scancodeio.wsgi.application" + +SECURE_PROXY_SSL_HEADER = env.tuple( + "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") +) + +# Database + +DATABASES = { + "default": { + "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), + "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), + "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), + "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), + "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, + } +} + +DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + +# Forms and filters + +FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") + +# Templates + +TEMPLATES = [ + { + "BACKEND": "django.template.backends.django.DjangoTemplates", + "APP_DIRS": True, + "OPTIONS": { + "debug": DEBUG, + "context_processors": [ + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "django.template.context_processors.request", + "scancodeio.context_processors.versions", + ], + }, + }, +] + +# Login + +LOGIN_REDIRECT_URL = "project_list" + +# Passwords + +AUTH_PASSWORD_VALIDATORS = [ + { + "NAME": ( + "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" + ), + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + "OPTIONS": { + "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), + }, + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, +] + +# Testing + +if IS_TESTS: + from django.core.management.utils import get_random_secret_key + + SECRET_KEY = get_random_secret_key() + # Do not pollute the workspace while running the tests. + SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() + SCANCODEIO_REQUIRE_AUTHENTICATION = True + SCANCODEIO_SCAN_FILE_TIMEOUT = 120 + SCANCODEIO_POLICIES_FILE = None + # The default password hasher is rather slow by design. + # Using a faster hashing algorithm in the testing context to speed up the run. 
+ PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] + +# Debug toolbar + +DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) +if DEBUG and DEBUG_TOOLBAR: + INSTALLED_APPS.append("debug_toolbar") + MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") + INTERNAL_IPS = ["127.0.0.1"] + +# Logging + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "simple": { + "format": "{levelname} {message}", + "style": "{", + }, + }, + "handlers": { + "null": { + "class": "logging.NullHandler", + }, + "console": { + "class": "logging.StreamHandler", + "formatter": "simple", + }, + }, + "loggers": { + "scanpipe": { + "handlers": ["null"] if IS_TESTS else ["console"], + "level": SCANCODEIO_LOG_LEVEL, + "propagate": False, + }, + "django": { + "handlers": ["null"] if IS_TESTS else ["console"], + "propagate": False, + }, + # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. + "django.db.backends": { + "level": SCANCODEIO_LOG_LEVEL, + }, + }, +} + +# Instead of sending out real emails the console backend just writes the emails +# that would be sent to the standard output. +EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" + +# Internationalization + +LANGUAGE_CODE = "en-us" + +FORMAT_MODULE_PATH = ["scancodeio.formats"] + +TIME_ZONE = env.str("TIME_ZONE", default="UTC") + +USE_I18N = True + +USE_TZ = True + +# Static files (CSS, JavaScript, Images) + +STATIC_URL = "/static/" + +STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") + +STATICFILES_DIRS = [ + PROJECT_DIR("static"), +] + +# Third-party apps + +CRISPY_TEMPLATE_PACK = "bootstrap3" + +# Centralized archive directory for all projects +CENTRAL_ARCHIVE_PATH = env.str( + "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" +) + +# localstorage configuration +DOWNLOAD_ARCHIVING_PROVIDER = env.str( + "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" +) + +# For local storage, we would store the root path in that setting +DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( + "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None +) + +# Initialize the DownloadStore for local storage + +download_store = None +logger = logging.getLogger(__name__) +if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": + config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} + root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) + try: + download_store = LocalFilesystemProvider(root_path=root_path) + except Exception as e: + logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") +else: + logger.error( + f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}" + ) + +# Job Queue + +RQ_QUEUES = { + "default": { + "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), + "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), + "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), + "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), + "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), + "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), + # Enable SSL for Redis connections when deploying ScanCode.io in environments + # where Redis is hosted on a separate system (e.g., cloud deployment or remote + # Redis server) to secure data in transit. 
+ "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), + }, +} + +SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) +if not SCANCODEIO_ASYNC: + for queue_config in RQ_QUEUES.values(): + queue_config["ASYNC"] = False + +# ClamAV virus scan +CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) +CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") + +# Django restframework + +REST_FRAMEWORK = { + "DEFAULT_AUTHENTICATION_CLASSES": ( + "rest_framework.authentication.TokenAuthentication", + ), + "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), + "DEFAULT_RENDERER_CLASSES": ( + "rest_framework.renderers.JSONRenderer", + "rest_framework.renderers.BrowsableAPIRenderer", + "rest_framework.renderers.AdminRenderer", + ), + "DEFAULT_FILTER_BACKENDS": ( + "django_filters.rest_framework.DjangoFilterBackend", + "rest_framework.filters.SearchFilter", + ), + "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", + "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), + "UPLOADED_FILES_USE_URL": False, +} + +if not SCANCODEIO_REQUIRE_AUTHENTICATION: + REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( + "rest_framework.permissions.AllowAny", + ) + +# VulnerableCode integration + +VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") +VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") +VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") +VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") + +# PurlDB integration + +PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") +PURLDB_USER = env.str("PURLDB_USER", default="") +PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") +PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") + +# MatchCode.io integration + +MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") +MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") +MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") +MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") + +# FederatedCode integration + +FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( + "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" +).rstrip("/") +FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") +FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") +FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py index 3f3d66e2e8..482f448de5 100644 --- a/scanpipe/archiving.py +++ b/scanpipe/archiving.py @@ -1,185 +1,190 @@ -# scanpipe/archiving.py -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
-# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import json -import logging -from abc import ABC -from abc import abstractmethod -from dataclasses import dataclass -from pathlib import Path - -logger = logging.getLogger(__name__) - - -@dataclass -class Download: - sha256: str - download_date: str - download_url: str - filename: str - - -class DownloadStore(ABC): - def _compute_sha256(self, content: bytes) -> str: - """Compute SHA256 hash for content.""" - return hashlib.sha256(content).hexdigest() - - def _compute_origin_hash( - self, filename: str, download_date: str, download_url: str - ) -> str: - """Compute a hash for the metadata to name the origin JSON file.""" - to_hash = f"{filename}{download_date}{download_url}".encode() - return hashlib.sha256(to_hash).hexdigest() - - def _build_metadata( - self, sha256: str, filename: str, download_date: str, download_url: str - ) -> dict: - """Build metadata dictionary for JSON storage.""" - return { - "sha256": sha256, - "filename": filename, - "download_date": download_date, - "download_url": download_url, - } - - @abstractmethod - def _get_content_path(self, sha256: str) -> str: - """Get the storage path/key for the content based on SHA256.""" - pass - - @abstractmethod - def list(self): - """Return an iterable of all stored downloads.""" - pass - - @abstractmethod - def get(self, sha256_checksum: str): - """Return a Download object for this checksum or None.""" - pass - - @abstractmethod - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """ - Store content with its metadata. Return a Download object on success. - Raise an exception on error. - """ - pass - - @abstractmethod - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Return a Download object matching the metadata or None.""" - pass - - -class LocalFilesystemProvider(DownloadStore): - def __init__(self, root_path: Path): - self.root_path = root_path - - def _get_content_path(self, sha256: str) -> Path: - """Create a nested path like 59/4c/67/... 
based on the SHA256 hash.""" - return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] - - def list(self): - """Return an iterable of all stored downloads.""" - downloads = [] - for content_path in self.root_path.rglob("content"): - origin_files = list(content_path.parent.glob("origin-*.json")) - for origin_file in origin_files: - try: - with open(origin_file) as f: - data = json.load(f) - downloads.append(Download(**data)) - except Exception as e: - logger.error(f"Error reading {origin_file}: {e}") - return downloads - - def get(self, sha256_checksum: str): - """Retrieve a Download object for the given SHA256 hash.""" - content_path = self._get_content_path(sha256_checksum) - if content_path.exists(): - origin_files = list(content_path.glob("origin-*.json")) - if origin_files: - try: - with open(origin_files[0]) as f: - data = json.load(f) - return Download(**data) - except Exception as e: - logger.error( - f"Error reading origin file for {sha256_checksum}: {e}" - ) - return None - - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store the content and its metadata.""" - sha256 = self._compute_sha256(content) - content_path = self._get_content_path(sha256) - content_path.mkdir(parents=True, exist_ok=True) - - content_file = content_path / "content" - if not content_file.exists(): - try: - with open(content_file, "wb") as f: - f.write(content) - except Exception as e: - raise Exception(f"Failed to write content to {content_file}: {e}") - - origin_hash = self._compute_origin_hash(filename, download_date, download_url) - origin_filename = f"origin-{origin_hash}.json" - origin_path = content_path / origin_filename - if origin_path.exists(): - raise Exception(f"Origin {origin_filename} already exists") - - metadata = self._build_metadata(sha256, filename, download_date, download_url) - try: - with open(origin_path, "w") as f: - json.dump(metadata, f, indent=2) - except Exception as e: - raise Exception(f"Failed to write metadata to {origin_path}: {e}") - - return Download(**metadata) - - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Find a download based on metadata.""" - if not (download_url or filename or download_date): - return None - for content_path in self.root_path.rglob("origin-*.json"): - try: - with open(content_path) as f: - data = json.load(f) - if ( - (download_url is None or data.get("url") == download_url) - and (filename is None or data.get("filename") == filename) - and ( - download_date is None - or data.get("download_date") == download_date - ) - ): - return Download(**data) - except Exception as e: - logger.error(f"Error reading {content_path}: {e}") - return None +# scanpipe/archiving.py +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import hashlib
+import json
+import logging
+from abc import ABC
+from abc import abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Download:
+    sha256: str
+    download_date: str
+    download_url: str
+    filename: str
+
+
+class DownloadStore(ABC):
+    def _compute_sha256(self, content: bytes) -> str:
+        """Compute SHA256 hash for content."""
+        return hashlib.sha256(content).hexdigest()
+
+    def _compute_origin_hash(
+        self, filename: str, download_date: str, download_url: str
+    ) -> str:
+        """Compute a hash for the metadata to name the origin JSON file."""
+        to_hash = f"{filename}{download_date}{download_url}".encode()
+        return hashlib.sha256(to_hash).hexdigest()
+
+    def _build_metadata(
+        self, sha256: str, filename: str, download_date: str, download_url: str
+    ) -> dict:
+        """Build metadata dictionary for JSON storage."""
+        return {
+            "sha256": sha256,
+            "filename": filename,
+            "download_date": download_date,
+            "download_url": download_url,
+        }
+
+    @abstractmethod
+    def _get_content_path(self, sha256: str) -> str:
+        """Get the storage path/key for the content based on SHA256."""
+        pass
+
+    @abstractmethod
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        pass
+
+    @abstractmethod
+    def get(self, sha256_checksum: str):
+        """Return a Download object for this checksum or None."""
+        pass
+
+    @abstractmethod
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """
+        Store content with its metadata. Return a Download object on success.
+        Raise an exception on error.
+        """
+        pass
+
+    @abstractmethod
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Return a Download object matching the metadata or None."""
+        pass
+
+
+class LocalFilesystemProvider(DownloadStore):
+    def __init__(self, root_path: Path):
+        self.root_path = root_path
+
+    def _get_content_path(self, sha256: str) -> Path:
+        """Create a nested path like 59/4c/67/... based on the SHA256 hash."""
+        return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]
+
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        downloads = []
+        for content_path in self.root_path.rglob("content"):
+            origin_files = list(content_path.parent.glob("origin-*.json"))
+            for origin_file in origin_files:
+                try:
+                    with open(origin_file) as f:
+                        data = json.load(f)
+                    downloads.append(Download(**data))
+                except Exception as e:
+                    logger.error(f"Error reading {origin_file}: {e}")
+        return downloads
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        content_path = self._get_content_path(sha256_checksum)
+        if content_path.exists():
+            origin_files = list(content_path.glob("origin-*.json"))
+            if origin_files:
+                try:
+                    with open(origin_files[0]) as f:
+                        data = json.load(f)
+                    return Download(**data)
+                except Exception as e:
+                    logger.error(
+                        f"Error reading origin file for {sha256_checksum}: {e}"
+                    )
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_path = self._get_content_path(sha256)
+        content_path.mkdir(parents=True, exist_ok=True)
+
+        content_file = content_path / "content"
+        if not content_file.exists():
+            try:
+                with open(content_file, "wb") as f:
+                    f.write(content)
+            except Exception as e:
+                raise Exception(f"Failed to write content to {content_file}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_path = content_path / origin_filename
+        if origin_path.exists():
+            raise Exception(f"Origin {origin_filename} already exists")
+
+        metadata = self._build_metadata(sha256, filename, download_date, download_url)
+        try:
+            with open(origin_path, "w") as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            raise Exception(f"Failed to write metadata to {origin_path}: {e}")
+
+        return Download(**metadata)
+
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        for content_path in self.root_path.rglob("origin-*.json"):
+            try:
+                with open(content_path) as f:
+                    data = json.load(f)
+                if (
+                    (download_url is None or data.get("download_url") == download_url)
+                    and (filename is None or data.get("filename") == filename)
+                    and (
+                        download_date is None
+                        or data.get("download_date") == download_date
+                    )
+                ):
+                    return Download(**data)
+            except Exception as e:
+                logger.error(f"Error reading {content_path}: {e}")
+        return None
diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py
index 5153bf1887..1b6cd4e0a0 100644
--- a/scanpipe/pipelines/__init__.py
+++ b/scanpipe/pipelines/__init__.py
@@ -1,353 +1,346 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import inspect -import logging -import traceback -from contextlib import contextmanager -from datetime import datetime -from functools import wraps -from pathlib import Path - -import bleach -from markdown_it import MarkdownIt -from pyinstrument import Profiler - -from aboutcode.pipeline import BasePipeline -from scancodeio.settings import download_store -from scancodeio.settings import settings - -logger = logging.getLogger(__name__) - - -class InputFilesError(Exception): - """InputFile is missing or cannot be downloaded.""" - - def __init__(self, error_tracebacks): - self.error_tracebacks = error_tracebacks - super().__init__(self._generate_message()) - - def _generate_message(self): - message = "InputFilesError encountered with the following issues:\n" - for index, (error, tb) in enumerate(self.error_tracebacks, start=1): - message += f"\nError {index}: {str(error)}\n\n{tb}" - return message - - -def convert_markdown_to_html(markdown_text): - """Convert Markdown text to sanitized HTML.""" - # Using the "js-default" for safety. - html_content = MarkdownIt("js-default").renderInline(markdown_text) - # Sanitize HTML using bleach. - sanitized_html = bleach.clean(html_content) - return sanitized_html - - -class CommonStepsMixin: - """Common steps available on all project pipelines.""" - - def flag_empty_files(self): - """Flag empty files.""" - from scanpipe.pipes import flag - - flag.flag_empty_files(self.project) - - def flag_ignored_resources(self): - """Flag ignored resources based on Project ``ignored_patterns`` setting.""" - from scanpipe.pipes import flag - - ignored_patterns = self.env.get("ignored_patterns", []) - - if isinstance(ignored_patterns, str): - ignored_patterns = ignored_patterns.splitlines() - ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS) - - flag.flag_ignored_patterns( - codebaseresources=self.project.codebaseresources.no_status(), - patterns=ignored_patterns, - ) - - def extract_archive(self, location, target): - """Extract archive at `location` to `target`. 
Save errors as messages.""" - from scanpipe.pipes import scancode - - extract_errors = scancode.extract_archive(location, target) - - for resource_location, errors in extract_errors.items(): - resource_path = Path(resource_location) - - if resource_path.is_relative_to(self.project.codebase_path): - resource_path = resource_path.relative_to(self.project.codebase_path) - details = {"resource_path": str(resource_path)} - elif resource_path.is_relative_to(self.project.input_path): - resource_path = resource_path.relative_to(self.project.input_path) - details = {"path": f"input/{str(resource_path)}"} - else: - details = {"filename": str(resource_path.name)} - - self.project.add_error( - description="\n".join(errors), - model="extract_archive", - details=details, - ) - - def extract_archives(self, location=None): - """Extract archives located in the codebase/ directory with extractcode.""" - from scanpipe.pipes import scancode - - if not location: - location = self.project.codebase_path - - extract_errors = scancode.extract_archives(location=location, recurse=True) - - for resource_path, errors in extract_errors.items(): - self.project.add_error( - description="\n".join(errors), - model="extract_archives", - details={"resource_path": resource_path}, - ) - - # Reload the project env post-extraction as the scancode-config.yml file - # may be located in one of the extracted archives. - self.env = self.project.get_env() - - def download_missing_inputs(self): - """ - Download any InputSource missing on disk. - Raise an error if any of the uploaded files is not available or not reachable. - """ - error_tracebacks = [] - - for input_source in self.project.inputsources.all(): - if input_source.exists(): - continue - - if input_source.is_uploaded: - msg = f"Uploaded file {input_source} not available." - self.log(msg) - error_tracebacks.append((msg, "No traceback available.")) - continue - - download_url = input_source.download_url - if not download_url: - continue - - url_hash = hashlib.sha256(download_url.encode()).hexdigest() - filename = ( - input_source.filename - or Path(download_url).name - or f"{url_hash}.archive" - ) - archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if archive_path.exists(): - logger.info(f"Reusing existing archive at {archive_path}") - input_source.file_path = str(archive_path) - input_source.save() - continue - - self.log(f"Fetching input from {input_source.download_url}") - try: - input_source.fetch() - - except Exception as error: - traceback_str = traceback.format_exc() - logger.error(traceback_str) - self.log(f"{input_source.download_url} could not be fetched.") - error_tracebacks.append((str(error), traceback_str)) - - if error_tracebacks: - raise InputFilesError(error_tracebacks) - - def archive_downloads(self): - """ - Archive downloaded inputs to the centralized DownloadStore if not already - archived.Updates InputSource with archiving metadata (sha256, download_date). 
- """ - logger.info(f"Archiving downloads for project {self.project.name}") - for input_source in self.project.inputsources.filter( - sha256__isnull=True, is_uploaded=False - ): - if input_source.download_url: - logger.warning( - f"No download URL for input {input_source.filename}, " - "skipping archiving" - ) - continue - - if not input_source.file_path: - logger.warning( - f"No file_path for input {input_source.download_url}, " - "skipping archiving" - ) - continue - try: - with open(input_source.file_path, "rb") as f: - content = f.read() - filename = ( - input_source.filename or input_source.download_url.split("/")[-1] - ) - download = download_store.put( - content=content, - download_url=input_source.download_url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - input_source.sha256 = download.sha256 - input_source.download_date = download.download_date - input_source.file_path = str(download.path) - input_source.save() - except Exception as e: - self.add_error( - exception=e, - message=f"Failed to archive {input_source.download_url}", - ) - - -class ProjectPipeline(CommonStepsMixin, BasePipeline): - """Main class for all project related pipelines including common steps methods.""" - - # Flag specifying whether to download missing inputs as an initial step. - download_inputs = True - - # Optional URL that targets a view of the results relative to this Pipeline. - # This URL may contain dictionary-style string formatting, which will be - # interpolated against the project's field attributes. - # For example, you could use results_url="/project/{slug}/packages/?filter=value" - # to target the Package list view with an active filtering. - results_url = "" - - def __init__(self, run_instance): - """Load the Pipeline execution context from a Run database object.""" - self.run = run_instance - self.project = run_instance.project - self.env = self.project.get_env() - - self.pipeline_class = run_instance.pipeline_class - self.pipeline_name = run_instance.pipeline_name - - self.selected_groups = run_instance.selected_groups or [] - self.selected_steps = run_instance.selected_steps or [] - - self.ecosystem_config = None - - @classmethod - def get_initial_steps(cls): - """Add the ``download_inputs`` step as an initial step if enabled.""" - steps = [] - if cls.download_inputs: - steps.append(cls.download_missing_inputs) - steps.append(cls.archive_downloads) - return tuple(steps) - - @classmethod - def get_info(cls, as_html=False): - """Add the option to render the values as HTML.""" - info = super().get_info() - - if as_html: - info["summary"] = convert_markdown_to_html(info["summary"]) - info["description"] = convert_markdown_to_html(info["description"]) - for step in info["steps"]: - step["doc"] = convert_markdown_to_html(step["doc"]) - - return info - - def append_to_log(self, message): - self.run.append_to_log(message) - - def set_current_step(self, message): - self.run.set_current_step(message) - - def add_error(self, exception, resource=None): - """Create a ``ProjectMessage`` ERROR record on the current `project`.""" - self.project.add_error( - model=self.pipeline_name, - exception=exception, - object_instance=resource, - ) - - @contextmanager - def save_errors(self, *exceptions, **kwargs): - """ - Context manager to save specified exceptions as ``ProjectMessage`` in the - database. 
- - - Example in a Pipeline step:: - - with self.save_errors(rootfs.DistroNotFound): - rootfs.scan_rootfs_for_system_packages(self.project, rfs) - - - Example when iterating over resources:: - - for resource in self.project.codebaseresources.all(): - with self.save_errors(Exception, resource=resource): - analyse(resource) - """ - try: - yield - except exceptions as error: - self.add_error(exception=error, **kwargs) - - -class Pipeline(ProjectPipeline): - """Alias for the ProjectPipeline class.""" - - pass - - -def is_pipeline(obj): - """ - Return True if the `obj` is a subclass of `Pipeline` except for the - `Pipeline` class itself. - """ - return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline - - -def profile(step): - """ - Profile a Pipeline step and save the results as HTML file in the project output - directory. - - Usage: - @profile - def step(self): - pass - """ - - @wraps(step) - def wrapper(*arg, **kwargs): - pipeline_instance = arg[0] - project = pipeline_instance.project - - with Profiler() as profiler: - result = step(*arg, **kwargs) - - output_file = project.get_output_file_path("profile", "html") - output_file.write_text(profiler.output_html()) - - pipeline_instance.log(f"Profiling results at {output_file.resolve()}") - - return result - - return wrapper +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
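Note: an illustrative sketch, not part of the patch. The pipeline module below
consumes the module-level ``download_store`` built in scancodeio/settings.py. A
minimal stand-alone use of the provider API defined above in scanpipe/archiving.py,
assuming only a writable scratch directory (the URL and filename are made up):

    from datetime import datetime
    from pathlib import Path

    from scanpipe.archiving import LocalFilesystemProvider

    store = LocalFilesystemProvider(root_path=Path("/tmp/scancodeio-downloads"))
    download = store.put(
        content=b"example bytes",
        download_url="https://example.com/pkg-1.0.tar.gz",  # hypothetical URL
        download_date=datetime.now().isoformat(),
        filename="pkg-1.0.tar.gz",
    )
    assert store.get(download.sha256).filename == "pkg-1.0.tar.gz"

``put()`` writes the content once under a sha256-derived directory and records
the origin metadata as a JSON file next to it; ``get()`` reads it back by checksum.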
+ +import inspect +import logging +import traceback +import hashlib +from contextlib import contextmanager +from datetime import datetime +from functools import wraps +from pathlib import Path + +import bleach +import requests +from markdown_it import MarkdownIt +from pyinstrument import Profiler + +from aboutcode.pipeline import BasePipeline +from scancodeio.settings import download_store + +logger = logging.getLogger(__name__) + + +class InputFilesError(Exception): + """InputFile is missing or cannot be downloaded.""" + + def __init__(self, error_tracebacks): + self.error_tracebacks = error_tracebacks + super().__init__(self._generate_message()) + + def _generate_message(self): + message = "InputFilesError encountered with the following issues:\n" + for index, (error, tb) in enumerate(self.error_tracebacks, start=1): + message += f"\nError {index}: {str(error)}\n\n{tb}" + return message + + +def convert_markdown_to_html(markdown_text): + """Convert Markdown text to sanitized HTML.""" + # Using the "js-default" for safety. + html_content = MarkdownIt("js-default").renderInline(markdown_text) + # Sanitize HTML using bleach. + sanitized_html = bleach.clean(html_content) + return sanitized_html + + +class CommonStepsMixin: + """Common steps available on all project pipelines.""" + + def flag_empty_files(self): + """Flag empty files.""" + from scanpipe.pipes import flag + + flag.flag_empty_files(self.project) + + def flag_ignored_resources(self): + """Flag ignored resources based on Project ``ignored_patterns`` setting.""" + from scanpipe.pipes import flag + + ignored_patterns = self.env.get("ignored_patterns", []) + + if isinstance(ignored_patterns, str): + ignored_patterns = ignored_patterns.splitlines() + ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS) + + flag.flag_ignored_patterns( + codebaseresources=self.project.codebaseresources.no_status(), + patterns=ignored_patterns, + ) + + def extract_archive(self, location, target): + """Extract archive at `location` to `target`. Save errors as messages.""" + from scanpipe.pipes import scancode + + extract_errors = scancode.extract_archive(location, target) + + for resource_location, errors in extract_errors.items(): + resource_path = Path(resource_location) + + if resource_path.is_relative_to(self.project.codebase_path): + resource_path = resource_path.relative_to(self.project.codebase_path) + details = {"resource_path": str(resource_path)} + elif resource_path.is_relative_to(self.project.input_path): + resource_path = resource_path.relative_to(self.project.input_path) + details = {"path": f"input/{str(resource_path)}"} + else: + details = {"filename": str(resource_path.name)} + + self.project.add_error( + description="\n".join(errors), + model="extract_archive", + details=details, + ) + + def extract_archives(self, location=None): + """Extract archives located in the codebase/ directory with extractcode.""" + from scanpipe.pipes import scancode + + if not location: + location = self.project.codebase_path + + extract_errors = scancode.extract_archives(location=location, recurse=True) + + for resource_path, errors in extract_errors.items(): + self.project.add_error( + description="\n".join(errors), + model="extract_archives", + details={"resource_path": resource_path}, + ) + + # Reload the project env post-extraction as the scancode-config.yml file + # may be located in one of the extracted archives. + self.env = self.project.get_env() + + def download_missing_inputs(self): + """ + Download any InputSource missing on disk. 
+        Raise an error if any of the uploaded files is not available or not reachable.
+        """
+        # CENTRAL_ARCHIVE_PATH is defined in scancodeio/settings.py; use the
+        # Django settings proxy to access it without importing the settings
+        # module directly.
+        from django.conf import settings
+
+        error_tracebacks = []
+
+        for input_source in self.project.inputsources.all():
+            if input_source.exists():
+                continue
+
+            if input_source.is_uploaded:
+                msg = f"Uploaded file {input_source} not available."
+                self.log(msg)
+                error_tracebacks.append((msg, "No traceback available."))
+                continue
+
+            download_url = input_source.download_url
+            if not download_url:
+                continue
+
+            url_hash = hashlib.sha256(download_url.encode()).hexdigest()
+            filename = (
+                input_source.filename
+                or Path(download_url).name
+                or f"{url_hash}.archive"
+            )
+            archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
+
+            if archive_path.exists():
+                logger.info(f"Reusing existing archive at {archive_path}")
+                input_source.file_path = str(archive_path)
+                input_source.save()
+                continue
+
+            self.log(f"Fetching input from {input_source.download_url}")
+            try:
+                input_source.fetch()
+            except Exception as error:
+                traceback_str = traceback.format_exc()
+                logger.error(traceback_str)
+                self.log(f"{input_source.download_url} could not be fetched.")
+                error_tracebacks.append((str(error), traceback_str))
+
+        if error_tracebacks:
+            raise InputFilesError(error_tracebacks)
+
+    def archive_downloads(self):
+        """
+        Archive downloaded inputs to the centralized DownloadStore if not
+        already archived. Update the InputSource with archiving metadata
+        (sha256, download_date).
+        """
+        logger.info(f"Archiving downloads for project {self.project.name}")
+        for input_source in self.project.inputsources.filter(
+            sha256__isnull=True, is_uploaded=False
+        ):
+            if input_source.download_url:
+                try:
+                    response = requests.get(
+                        input_source.download_url, stream=True, timeout=30
+                    )
+                    response.raise_for_status()
+                    content = response.content
+                    filename = (
+                        input_source.filename
+                        or input_source.download_url.split("/")[-1]
+                    )
+                    download = download_store.put(
+                        content=content,
+                        download_url=input_source.download_url,
+                        download_date=datetime.now().isoformat(),
+                        filename=filename,
+                    )
+                    input_source.sha256 = download.sha256
+                    input_source.download_date = download.download_date
+                    input_source.save()
+                except Exception as e:
+                    logger.error(
+                        f"Failed to archive {input_source.download_url}: {e}"
+                    )
+                    self.add_error(exception=e)
+            else:
+                logger.warning(
+                    f"No download URL for input {input_source.filename}, "
+                    "skipping archiving"
+                )
+
+
+class ProjectPipeline(CommonStepsMixin, BasePipeline):
+    """Main class for all project related pipelines including common steps methods."""
+
+    # Flag specifying whether to download missing inputs as an initial step.
+    download_inputs = True
+
+    # Optional URL that targets a view of the results relative to this Pipeline.
+    # This URL may contain dictionary-style string formatting, which will be
+    # interpolated against the project's field attributes.
+    # For example, you could use results_url="/project/{slug}/packages/?filter=value"
+    # to target the Package list view with an active filtering.
+    results_url = ""
+
+    def __init__(self, run_instance):
+        """Load the Pipeline execution context from a Run database object."""
+        self.run = run_instance
+        self.project = run_instance.project
+        self.env = self.project.get_env()
+
+        self.pipeline_class = run_instance.pipeline_class
+        self.pipeline_name = run_instance.pipeline_name
+
+        self.selected_groups = run_instance.selected_groups or []
+        self.selected_steps = run_instance.selected_steps or []
+
+        self.ecosystem_config = None
+
+    @classmethod
+    def get_initial_steps(cls):
+        """Add the download and archiving steps as initial steps if enabled."""
+        steps = []
+        if cls.download_inputs:
+            steps.append(cls.download_missing_inputs)
+            # Only schedule archiving when a DownloadStore provider is configured.
+            if download_store is not None:
+                steps.append(cls.archive_downloads)
+        return tuple(steps)
+
+    @classmethod
+    def get_info(cls, as_html=False):
+        """Add the option to render the values as HTML."""
+        info = super().get_info()
+
+        if as_html:
+            info["summary"] = convert_markdown_to_html(info["summary"])
+            info["description"] = convert_markdown_to_html(info["description"])
+            for step in info["steps"]:
+                step["doc"] = convert_markdown_to_html(step["doc"])
+
+        return info
+
+    def append_to_log(self, message):
+        self.run.append_to_log(message)
+
+    def set_current_step(self, message):
+        self.run.set_current_step(message)
+
+    def add_error(self, exception, resource=None):
+        """Create a ``ProjectMessage`` ERROR record on the current `project`."""
+        self.project.add_error(
+            model=self.pipeline_name,
+            exception=exception,
+            object_instance=resource,
+        )
+
+    @contextmanager
+    def save_errors(self, *exceptions, **kwargs):
+        """
+        Context manager to save specified exceptions as ``ProjectMessage`` in the
+        database.
+
+        - Example in a Pipeline step::
+
+            with self.save_errors(rootfs.DistroNotFound):
+                rootfs.scan_rootfs_for_system_packages(self.project, rfs)
+
+        - Example when iterating over resources::
+
+            for resource in self.project.codebaseresources.all():
+                with self.save_errors(Exception, resource=resource):
+                    analyse(resource)
+        """
+        try:
+            yield
+        except exceptions as error:
+            self.add_error(exception=error, **kwargs)
+
+
+class Pipeline(ProjectPipeline):
+    """Alias for the ProjectPipeline class."""
+
+    pass
+
+
+def is_pipeline(obj):
+    """
+    Return True if the `obj` is a subclass of `Pipeline` except for the
+    `Pipeline` class itself.
+    """
+    return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline
+
+
+def profile(step):
+    """
+    Profile a Pipeline step and save the results as HTML file in the project output
+    directory.
+
+    Usage:
+        @profile
+        def step(self):
+            pass
+    """
+
+    @wraps(step)
+    def wrapper(*arg, **kwargs):
+        pipeline_instance = arg[0]
+        project = pipeline_instance.project
+
+        with Profiler() as profiler:
+            result = step(*arg, **kwargs)
+
+        output_file = project.get_output_file_path("profile", "html")
+        output_file.write_text(profiler.output_html())
+
+        pipeline_instance.log(f"Profiling results at {output_file.resolve()}")
+
+        return result
+
+    return wrapper
diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py
index 906a2ee3a1..81ae91c21d 100644
--- a/scanpipe/pipes/input.py
+++ b/scanpipe/pipes/input.py
@@ -1,345 +1,347 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import logging -import os -import shutil -from datetime import datetime -from pathlib import Path - -from django.core.exceptions import FieldDoesNotExist -from django.core.validators import EMPTY_VALUES -from django.db import models - -import openpyxl -import requests -from typecode.contenttype import get_type - -from scancodeio.settings import download_store -from scanpipe import pipes -from scanpipe.models import CodebaseRelation -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredDependency -from scanpipe.models import DiscoveredLicense -from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource -from scanpipe.pipes import scancode -from scanpipe.pipes.output import mappings_key_by_fieldname - -logger = logging.getLogger(__name__) - - -def copy_input(input_location, dest_path): - """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" - input_path = Path(input_location) - destination_dir = Path(dest_path) - destination = destination_dir / input_path.name - - if input_path.is_dir(): - shutil.copytree(input_location, destination) - else: - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - shutil.copyfile(input_location, destination) - - return destination - - -def copy_inputs(input_locations, dest_path): - """Copy the provided ``input_locations`` to the ``dest_path``.""" - for input_location in input_locations: - copy_input(input_location, dest_path) - - -def move_input(input_location, dest_path): - """Move the provided ``input_location`` to the ``dest_path``.""" - destination = dest_path / Path(input_location).name - return shutil.move(input_location, destination) - - -def move_inputs(inputs, dest_path): - """Move the provided ``inputs`` to the ``dest_path``.""" - for input_location in inputs: - move_input(input_location, dest_path) - - -def get_tool_name_from_scan_headers(scan_data): - """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - tool_name = first_header.get("tool_name", "") - return tool_name - - -def get_extra_data_from_scan_headers(scan_data): - """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - if extra_data := first_header.get("extra_data"): - return extra_data - - -def is_archive(location): - """Return True if the file at ``location`` is an archive.""" - return get_type(location).is_archive - - -def load_inventory_from_toolkit_scan(project, 
input_location): - """ - Create license detections, packages, dependencies, and resources - loaded from the ScanCode-toolkit scan results located at ``input_location``. - """ - scanned_codebase = scancode.get_virtual_codebase(project, input_location) - scancode.create_discovered_licenses(project, scanned_codebase) - scancode.create_discovered_packages(project, scanned_codebase) - scancode.create_codebase_resources(project, scanned_codebase) - scancode.create_discovered_dependencies( - project, scanned_codebase, strip_datafile_path_root=True - ) - scancode.load_todo_issues(project, scanned_codebase) - - -def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): - """ - Create packages, dependencies, license detections, resources, and relations - loaded from a ScanCode.io JSON output provided as ``scan_data``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - for detection_data in scan_data.get("license_detections", []): - pipes.update_or_create_license_detection(project, detection_data) - - for package_data in scan_data.get("packages", []): - pipes.update_or_create_package(project, package_data) - - for resource_data in scan_data.get("files", []): - pipes.update_or_create_resource(project, resource_data) - - for dependency_data in scan_data.get("dependencies", []): - pipes.update_or_create_dependency(project, dependency_data) - - for relation_data in scan_data.get("relations", []): - pipes.get_or_create_relation(project, relation_data) - - if extra_data := get_extra_data_from_scan_headers(scan_data): - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -model_to_object_maker_func = { - DiscoveredPackage: pipes.update_or_create_package, - DiscoveredDependency: pipes.update_or_create_dependency, - DiscoveredLicense: pipes.update_or_create_license_detection, - CodebaseResource: pipes.update_or_create_resource, - CodebaseRelation: pipes.get_or_create_relation, -} - -worksheet_name_to_model = { - "PACKAGES": DiscoveredPackage, - "LICENSE_DETECTIONS": DiscoveredLicense, - "RESOURCES": CodebaseResource, - "DEPENDENCIES": DiscoveredDependency, - "RELATIONS": CodebaseRelation, -} - - -def get_worksheet_data(worksheet): - """Return the data from provided ``worksheet`` as a list of dict.""" - try: - header = [cell.value for cell in next(worksheet.rows)] - except StopIteration: - return {} - - worksheet_data = [ - dict(zip(header, row)) - for row in worksheet.iter_rows(min_row=2, values_only=True) - ] - return worksheet_data - - -def clean_xlsx_field_value(model_class, field_name, value): - """Clean the ``value`` for compatibility with the database ``model_class``.""" - if value in EMPTY_VALUES: - return - - if field_name == "for_packages": - return value.splitlines() - - elif field_name in ["purl", "for_package_uid", "datafile_path"]: - return value - - try: - field = model_class._meta.get_field(field_name) - except FieldDoesNotExist: - return - - if dict_key := mappings_key_by_fieldname.get(field_name): - return [{dict_key: entry} for entry in value.splitlines()] - - elif isinstance(field, models.JSONField): - if field.default is list: - return value.splitlines() - elif field.default is dict: - return # dict stored as JSON are not supported - - return value - - -def clean_xlsx_data_to_model_data(model_class, xlsx_data): - """Clean the ``xlsx_data`` for compatibility with the database ``model_class``.""" - 
cleaned_data = {} - - for field_name, value in xlsx_data.items(): - if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): - cleaned_data[field_name] = cleaned_value - - return cleaned_data - - -def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): - """ - Create packages, dependencies, resources, and relations loaded from XLSX file - located at ``input_location``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) - - for worksheet_name, model_class in worksheet_name_to_model.items(): - if worksheet_name not in workbook: - continue - - worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) - for row_data in worksheet_data: - object_maker_func = model_to_object_maker_func.get(model_class) - cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) - if cleaned_data: - object_maker_func(project, cleaned_data) - - if "LAYERS" in workbook: - layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) - extra_data = {"layers": layers_data} - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -def add_input_from_url(project, url, filename=None): - """ - Download the file from the provided ``url`` and add it as an InputSource for the - specified ``project``. Optionally, specify a ``filename`` for the downloaded file. - If archiving is enabled, store the content in the DownloadStore and save metadata. - """ - try: - response = requests.get(url, stream=True, timeout=30) - response.raise_for_status() - content = response.content - except requests.RequestException as e: - logger.error(f"Failed to download {url}: {e}") - raise - - filename = filename or url.split("/")[-1] or "downloaded_file" - - if download_store: - try: - download = download_store.put( - content=content, - download_url=url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to archive download for {url}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - download_url=url, - file_path=str(input_path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise - - -def add_input_from_upload(project, uploaded_file): - """ - Add an uploaded file as an InputSource for the specified ``project``. - If archiving is enabled, store the content in the DownloadStore and save metadata. 
- """ - content = uploaded_file.read() - filename = uploaded_file.name - - if download_store: - try: - download = download_store.put( - content=content, - download_url="", - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to archive upload {filename}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - file_path=str(input_path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
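Note: illustrative, not part of the patch. The input helpers below deduplicate
archived downloads by content: with the LocalFilesystemProvider, each unique
payload is stored once under a sha256-derived directory, with one origin JSON
per (filename, download_date, download_url) triple:

    <root>/59/4c/<remaining sha256 hex>/content
    <root>/59/4c/<remaining sha256 hex>/origin-<metadata sha256>.json

Calling, for example, ``add_input_from_url(project, "https://example.com/pkg-1.0.tar.gz")``
(hypothetical URL) a second time writes no new content bytes; only a new origin
JSON is recorded, since the download date differs between calls.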
+
+import logging
+import os
+import shutil
+from datetime import datetime
+from pathlib import Path
+
+from django.core.exceptions import FieldDoesNotExist
+from django.core.validators import EMPTY_VALUES
+from django.db import models
+
+import openpyxl
+import requests
+from typecode.contenttype import get_type
+
+from scanpipe import pipes
+from scanpipe.models import CodebaseRelation
+from scanpipe.models import CodebaseResource
+from scanpipe.models import DiscoveredDependency
+from scanpipe.models import DiscoveredLicense
+from scanpipe.models import DiscoveredPackage
+from scanpipe.models import InputSource
+from scanpipe.pipes import scancode
+from scanpipe.pipes.output import mappings_key_by_fieldname
+from scancodeio.settings import download_store
+
+logger = logging.getLogger(__name__)
+
+
+def copy_input(input_location, dest_path):
+    """Copy the ``input_location`` (file or directory) to the ``dest_path``."""
+    input_path = Path(input_location)
+    destination_dir = Path(dest_path)
+    destination = destination_dir / input_path.name
+
+    if input_path.is_dir():
+        shutil.copytree(input_location, destination)
+    else:
+        if not os.path.exists(destination_dir):
+            os.makedirs(destination_dir)
+        shutil.copyfile(input_location, destination)
+
+    return destination
+
+
+def copy_inputs(input_locations, dest_path):
+    """Copy the provided ``input_locations`` to the ``dest_path``."""
+    for input_location in input_locations:
+        copy_input(input_location, dest_path)
+
+
+def move_input(input_location, dest_path):
+    """Move the provided ``input_location`` to the ``dest_path``."""
+    destination = dest_path / Path(input_location).name
+    return shutil.move(input_location, destination)
+
+
+def move_inputs(inputs, dest_path):
+    """Move the provided ``inputs`` to the ``dest_path``."""
+    for input_location in inputs:
+        move_input(input_location, dest_path)
+
+
+def get_tool_name_from_scan_headers(scan_data):
+    """Return the ``tool_name`` of the first header in the provided ``scan_data``."""
+    if headers := scan_data.get("headers", []):
+        first_header = headers[0]
+        tool_name = first_header.get("tool_name", "")
+        return tool_name
+
+
+def get_extra_data_from_scan_headers(scan_data):
+    """Return the ``extra_data`` of the first header in the provided ``scan_data``."""
+    if headers := scan_data.get("headers", []):
+        first_header = headers[0]
+        if extra_data := first_header.get("extra_data"):
+            return extra_data
+
+
+def is_archive(location):
+    """Return True if the file at ``location`` is an archive."""
+    return get_type(location).is_archive
+
+
+def load_inventory_from_toolkit_scan(project, input_location):
+    """
+    Create license detections, packages, dependencies, and resources
+    loaded from the ScanCode-toolkit scan results located at ``input_location``.
+    """
+    scanned_codebase = scancode.get_virtual_codebase(project, input_location)
+    scancode.create_discovered_licenses(project, scanned_codebase)
+    scancode.create_discovered_packages(project, scanned_codebase)
+    scancode.create_codebase_resources(project, scanned_codebase)
+    scancode.create_discovered_dependencies(
+        project, scanned_codebase, strip_datafile_path_root=True
+    )
+    scancode.load_todo_issues(project, scanned_codebase)
+
+
+def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None):
+    """
+    Create packages, dependencies, license detections, resources, and relations
+    loaded from a ScanCode.io JSON output provided as ``scan_data``.
+ + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. + """ + for detection_data in scan_data.get("license_detections", []): + pipes.update_or_create_license_detection(project, detection_data) + + for package_data in scan_data.get("packages", []): + pipes.update_or_create_package(project, package_data) + + for resource_data in scan_data.get("files", []): + pipes.update_or_create_resource(project, resource_data) + + for dependency_data in scan_data.get("dependencies", []): + pipes.update_or_create_dependency(project, dependency_data) + + for relation_data in scan_data.get("relations", []): + pipes.get_or_create_relation(project, relation_data) + + if extra_data := get_extra_data_from_scan_headers(scan_data): + if extra_data_prefix: + extra_data = {extra_data_prefix: extra_data} + project.update_extra_data(extra_data) + + +model_to_object_maker_func = { + DiscoveredPackage: pipes.update_or_create_package, + DiscoveredDependency: pipes.update_or_create_dependency, + DiscoveredLicense: pipes.update_or_create_license_detection, + CodebaseResource: pipes.update_or_create_resource, + CodebaseRelation: pipes.get_or_create_relation, +} + +worksheet_name_to_model = { + "PACKAGES": DiscoveredPackage, + "LICENSE_DETECTIONS": DiscoveredLicense, + "RESOURCES": CodebaseResource, + "DEPENDENCIES": DiscoveredDependency, + "RELATIONS": CodebaseRelation, +} + + +def get_worksheet_data(worksheet): + """Return the data from provided ``worksheet`` as a list of dict.""" + try: + header = [cell.value for cell in next(worksheet.rows)] + except StopIteration: + return {} + + worksheet_data = [ + dict(zip(header, row)) + for row in worksheet.iter_rows(min_row=2, values_only=True) + ] + return worksheet_data + + +def clean_xlsx_field_value(model_class, field_name, value): + """Clean the ``value`` for compatibility with the database ``model_class``.""" + if value in EMPTY_VALUES: + return + + if field_name == "for_packages": + return value.splitlines() + + elif field_name in ["purl", "for_package_uid", "datafile_path"]: + return value + + try: + field = model_class._meta.get_field(field_name) + except FieldDoesNotExist: + return + + if dict_key := mappings_key_by_fieldname.get(field_name): + return [{dict_key: entry} for entry in value.splitlines()] + + elif isinstance(field, models.JSONField): + if field.default is list: + return value.splitlines() + elif field.default is dict: + return # dict stored as JSON are not supported + + return value + + +def clean_xlsx_data_to_model_data(model_class, xlsx_data): + """Clean the ``xlsx_data`` for compatibility with the database ``model_class``.""" + cleaned_data = {} + + for field_name, value in xlsx_data.items(): + if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): + cleaned_data[field_name] = cleaned_value + + return cleaned_data + + +def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): + """ + Create packages, dependencies, resources, and relations loaded from XLSX file + located at ``input_location``. + + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. 
+    """
+    workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True)
+
+    for worksheet_name, model_class in worksheet_name_to_model.items():
+        if worksheet_name not in workbook:
+            continue
+
+        worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name])
+        for row_data in worksheet_data:
+            object_maker_func = model_to_object_maker_func.get(model_class)
+            cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data)
+            if cleaned_data:
+                object_maker_func(project, cleaned_data)
+
+    if "LAYERS" in workbook:
+        layers_data = get_worksheet_data(worksheet=workbook["LAYERS"])
+        extra_data = {"layers": layers_data}
+        if extra_data_prefix:
+            extra_data = {extra_data_prefix: extra_data}
+        project.update_extra_data(extra_data)
+
+
+def add_input_from_url(project, url, filename=None):
+    """
+    Download the file from the provided ``url`` and add it as an InputSource for the
+    specified ``project``. Optionally, specify a ``filename`` for the downloaded file.
+    If archiving is enabled, store the content in the DownloadStore and save metadata.
+    """
+    try:
+        response = requests.get(url, stream=True, timeout=30)
+        response.raise_for_status()
+        content = response.content
+    except requests.RequestException as e:
+        logger.error(f"Failed to download {url}: {e}")
+        raise
+
+    filename = filename or url.split("/")[-1] or "downloaded_file"
+
+    if download_store:
+        try:
+            download = download_store.put(
+                content=content,
+                download_url=url,
+                download_date=datetime.now().isoformat(),
+                filename=filename,
+            )
+            # The local filesystem provider stores the bytes in a
+            # sha256-derived directory; derive the on-disk location from
+            # the returned checksum.
+            content_path = download_store._get_content_path(download.sha256)
+            InputSource.objects.create(
+                project=project,
+                sha256=download.sha256,
+                download_url=download.download_url,
+                filename=download.filename,
+                download_date=download.download_date,
+                file_path=str(content_path / "content"),
+                is_uploaded=False,
+            )
+        except Exception as e:
+            logger.error(f"Failed to archive download for {url}: {e}")
+            raise
+    else:
+        input_path = project.input_path / filename
+        try:
+            input_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(input_path, "wb") as f:
+                f.write(content)
+            InputSource.objects.create(
+                project=project,
+                filename=filename,
+                download_url=url,
+                file_path=str(input_path),
+                is_uploaded=False,
+            )
+        except Exception as e:
+            logger.error(f"Failed to save {filename} to {input_path}: {e}")
+            raise
+
+
+def add_input_from_upload(project, uploaded_file):
+    """
+    Add an uploaded file as an InputSource for the specified ``project``.
+    If archiving is enabled, store the content in the DownloadStore and save metadata.
+    """
+    content = uploaded_file.read()
+    filename = uploaded_file.name
+
+    if download_store:
+        try:
+            download = download_store.put(
+                content=content,
+                download_url="",
+                download_date=datetime.now().isoformat(),
+                filename=filename,
+            )
+            # Same as add_input_from_url: derive the stored content location
+            # from the checksum (local filesystem provider layout).
+            content_path = download_store._get_content_path(download.sha256)
+            InputSource.objects.create(
+                project=project,
+                sha256=download.sha256,
+                download_url=download.download_url,
+                filename=download.filename,
+                download_date=download.download_date,
+                file_path=str(content_path / "content"),
+                is_uploaded=True,
+            )
+        except Exception as e:
+            logger.error(f"Failed to archive upload {filename}: {e}")
+            raise
+    else:
+        input_path = project.input_path / filename
+        try:
+            input_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(input_path, "wb") as f:
+                f.write(content)
+            InputSource.objects.create(
+                project=project,
+                filename=filename,
+                file_path=str(input_path),
+                is_uploaded=True,
+            )
+        except Exception as e:
+            logger.error(f"Failed to save {filename} to {input_path}: {e}")
+            raise
\ No newline at end of file
diff --git a/scanpipe/tests/test_archiving.py b/scanpipe/tests/test_archiving.py
index 0da1a236b5..a249c96c46 100644
--- a/scanpipe/tests/test_archiving.py
+++ b/scanpipe/tests/test_archiving.py
@@ -1,86 +1,86 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
-# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
-# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
-# ScanCode.io should be considered or used as legal advice. Consult an Attorney
-# for any legal advice.
-#
-# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
-# Visit https://github.com/aboutcode-org/scancode.io for support and download.
- - -import hashlib -from pathlib import Path - -from django.test import TestCase - -from scanpipe.archiving import LocalFilesystemProvider -from scanpipe.tests import make_project - - -class TestArchiving(TestCase): - def setUp(self): - self.project = make_project() - self.root_path = Path(__file__).parent / "data" / "test_downloads" - self.store = LocalFilesystemProvider(root_path=self.root_path) - self.test_content = b"test content" - self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - self.test_filename = "sample.tar.gz" - - def tearDown(self): - if self.root_path.exists(): - import shutil - - shutil.rmtree(self.root_path) - - def test_local_filesystem_provider_put_get(self): - download = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - sha256 = hashlib.sha256(self.test_content).hexdigest() - self.assertEqual(download.sha256, sha256) - self.assertEqual(download.download_url, self.test_url) - self.assertEqual(download.filename, self.test_filename) - self.assertEqual(download.download_date, "2025-08-21T09:00:00") - content_path = ( - self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" - ) - self.assertTrue(content_path.exists()) - with open(content_path, "rb") as f: - self.assertEqual(f.read(), self.test_content) - - retrieved = self.store.get(sha256) - self.assertEqual(retrieved.sha256, sha256) - self.assertEqual(retrieved.download_url, self.test_url) - self.assertEqual(retrieved.filename, self.test_filename) - - def test_local_filesystem_provider_deduplication(self): - download1 = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - download2 = self.store.put( - content=self.test_content, - download_url="https://files.pythonhosted.org/packages/another.tar.gz", - download_date="2025-08-21T10:00:00", - filename="another.tar.gz", - ) - self.assertEqual(download1.sha256, download2.sha256) - self.assertEqual(download1.download_url, self.test_url) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
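+
+# These tests cover the content-addressed layout used by
+# LocalFilesystemProvider: the payload for a given sha256 is stored under
+# <root>/<sha256[:2]>/<sha256[2:4]>/<sha256[4:]>/content, with one
+# origin-<hash>.json metadata file per (filename, date, url) origin, so
+# identical content is stored once and deduplicated across origins.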
+ + +import hashlib +from pathlib import Path + +from django.test import TestCase + +from scanpipe.archiving import LocalFilesystemProvider +from scanpipe.tests import make_project + + +class TestArchiving(TestCase): + def setUp(self): + self.project = make_project() + self.root_path = Path(__file__).parent / "data" / "test_downloads" + self.store = LocalFilesystemProvider(root_path=self.root_path) + self.test_content = b"test content" + self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" + self.test_filename = "sample.tar.gz" + + def tearDown(self): + if self.root_path.exists(): + import shutil + + shutil.rmtree(self.root_path) + + def test_local_filesystem_provider_put_get(self): + download = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + sha256 = hashlib.sha256(self.test_content).hexdigest() + self.assertEqual(download.sha256, sha256) + self.assertEqual(download.download_url, self.test_url) + self.assertEqual(download.filename, self.test_filename) + self.assertEqual(download.download_date, "2025-08-21T09:00:00") + content_path = ( + self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" + ) + self.assertTrue(content_path.exists()) + with open(content_path, "rb") as f: + self.assertEqual(f.read(), self.test_content) + + retrieved = self.store.get(sha256) + self.assertEqual(retrieved.sha256, sha256) + self.assertEqual(retrieved.download_url, self.test_url) + self.assertEqual(retrieved.filename, self.test_filename) + + def test_local_filesystem_provider_deduplication(self): + download1 = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + download2 = self.store.put( + content=self.test_content, + download_url="https://files.pythonhosted.org/packages/another.tar.gz", + download_date="2025-08-21T10:00:00", + filename="another.tar.gz", + ) + self.assertEqual(download1.sha256, download2.sha256) + self.assertEqual(download1.download_url, self.test_url) diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index e55a90cace..3f2848cf1b 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -1,112 +1,143 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: -# http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, -# software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an -# "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. 
-# Visit https://github.com/aboutcode-org/scancode.io for support and download. - - -from pathlib import Path -from unittest.mock import patch - -from django.core.files.uploadedfile import SimpleUploadedFile -from django.test import TestCase - -from scancodeio.settings import settings -from scanpipe.models import InputSource -from scanpipe.pipes.input import add_input_from_upload -from scanpipe.pipes.input import add_input_from_url -from scanpipe.tests import make_project - - -class TestInput(TestCase): - def setUp(self): - self.project = make_project() - self.test_filename = "sample.tar.gz" - self.test_data_path = ( - Path(__file__).parent / "data" / "test-downloads" / self.test_filename - ) - with open(self.test_data_path, "rb") as f: - self.test_content = f.read() - - @patch("requests.get") - def test_add_input_from_url(self, mock_get): - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url(self.project, test_url, filename=self.test_filename) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - @patch("requests.get") - def test_add_input_from_url_fallback(self, mock_get): - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url(self.project, test_url, filename=self.test_filename) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith(str(self.project.input_path)) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - def test_add_input_from_upload(self): - uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertTrue(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - def test_add_input_from_upload_fallback(self): - uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) - 
self.assertTrue(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith(str(self.project.input_path)) - ) - self.assertTrue(Path(input_source.file_path).exists()) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: +# http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, +# software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an +# "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + + +from pathlib import Path +from unittest.mock import patch + +from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase + +from scanpipe.models import InputSource +from scanpipe.pipes.input import add_input_from_upload +from scanpipe.pipes.input import add_input_from_url +from scancodeio.settings import settings +from scanpipe.tests import make_project + + +class TestInput(TestCase): + def setUp(self): + self.project = make_project() + self.test_filename = "sample.tar.gz" + self.test_data_path = ( + Path(__file__).parent / + "data" / + "test-downloads" / + self.test_filename + ) + with open(self.test_data_path, "rb") as f: + self.test_content = f.read() + + @patch("requests.get") + def test_add_input_from_url(self, mock_get): + test_url = ( + "https://files.pythonhosted.org/" + "packages/sample.tar.gz" + ) + mock_get.return_value.content = self.test_content + mock_get.return_value.status_code = 200 + add_input_from_url( + self.project, + test_url, + filename=self.test_filename + ) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, test_url) + self.assertTrue(input_source.sha256) + self.assertTrue(input_source.download_date) + self.assertFalse(input_source.is_uploaded) + self.assertTrue( + input_source.file_path.startswith( + settings.CENTRAL_ARCHIVE_PATH + ) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + @patch("scanpipe.pipes.input.download_store", None) + @patch("requests.get") + def test_add_input_from_url_fallback(self, mock_get): + test_url = ( + "https://files.pythonhosted.org/" + "packages/sample.tar.gz" + ) + mock_get.return_value.content = self.test_content + mock_get.return_value.status_code = 200 + add_input_from_url( + self.project, + test_url, + filename=self.test_filename + ) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, test_url) + 
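+        # With download_store patched to None, the fallback stores the file in
+        # the project's input directory and records no sha256 or download date.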
+        self.assertFalse(input_source.sha256)
+        self.assertFalse(input_source.download_date)
+        self.assertFalse(input_source.is_uploaded)
+        self.assertTrue(
+            str(input_source.file_path).startswith(
+                str(self.project.input_path)
+            )
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
+
+    def test_add_input_from_upload(self):
+        uploaded_file = SimpleUploadedFile(
+            self.test_filename,
+            self.test_content
+        )
+        add_input_from_upload(self.project, uploaded_file)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, "")
+        self.assertTrue(input_source.sha256)
+        self.assertTrue(input_source.download_date)
+        self.assertTrue(input_source.is_uploaded)
+        self.assertTrue(
+            input_source.file_path.startswith(
+                settings.CENTRAL_ARCHIVE_PATH
+            )
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
+
+    @patch("scanpipe.pipes.input.download_store", None)
+    def test_add_input_from_upload_fallback(self):
+        uploaded_file = SimpleUploadedFile(
+            self.test_filename,
+            self.test_content
+        )
+        add_input_from_upload(self.project, uploaded_file)
+        input_source = InputSource.objects.get(project=self.project)
+        self.assertEqual(input_source.filename, self.test_filename)
+        self.assertEqual(input_source.download_url, "")
+        self.assertFalse(input_source.sha256)
+        self.assertFalse(input_source.download_date)
+        self.assertTrue(input_source.is_uploaded)
+        self.assertTrue(
+            str(input_source.file_path).startswith(
+                str(self.project.input_path)
+            )
+        )
+        self.assertTrue(Path(input_source.file_path).exists())
diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py
index 16c6260ebc..edb1e4687e 100644
--- a/scanpipe/tests/test_pipelines.py
+++ b/scanpipe/tests/test_pipelines.py
@@ -1,6 +1,9 @@
 <<<<<<< HEAD
 <<<<<<< HEAD
+<<<<<<< HEAD
+=======
+>>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""")
 # SPDX-License-Identifier: Apache-2.0
 #
 # http://nexb.com and https://github.com/nexB/scancode.io
@@ -32,6 +35,10 @@ from pathlib import Path
 from unittest import mock
 from unittest import skipIf
+<<<<<<< HEAD
+=======
+from unittest.mock import patch
+>>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""")
 from django.conf import settings
 from django.test import TestCase
@@ -302,30 +309,49 @@ def test_archive_downloads(self, mock_get):
         with open(test_data_path, "rb") as f:
             test_content = f.read()
+<<<<<<< HEAD
         input_source = InputSource.objects.create(
+=======
+        InputSource.objects.create(
+>>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""")
             project=project1,
             filename=test_filename,
             download_url=test_url,
             is_uploaded=False,
         )
+<<<<<<< HEAD
+=======
+
+>>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""")
         mock_get.return_value.content = test_content
         mock_get.return_value.status_code = 200
         pipeline.download_missing_inputs()
         input_source.refresh_from_db()
+<<<<<<< HEAD
         self.assertTrue(
             input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)
         )
         self.assertTrue(Path(input_source.file_path).exists())
+=======
+        self.assertTrue(input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH))
+        self.assertTrue(Path(input_source.file_path).exists())
+
+
+>>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""")
         pipeline.archive_downloads()
         input_source.refresh_from_db()
         self.assertTrue(input_source.sha256)
         self.assertTrue(input_source.download_date)
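        # Archiving must preserve the original download URL and filename.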
self.assertEqual(input_source.download_url, test_url) self.assertEqual(input_source.filename, test_filename) +<<<<<<< HEAD +======= + +>>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") project2 = make_project(name="project2") input_source2 = InputSource.objects.create( project=project2, @@ -2055,4 +2081,3 @@ def test_scanpipe_enrich_with_purldb_pipeline_integration( run.refresh_from_db() self.assertIn("pkg:npm/csvtojson@2.0.10 ['release_date'", run.log) self.assertIn("1 discovered package enriched with the PurlDB.", run.log) - From 544f9e2727538dac7cf0e904bbef6bc6a9dfa868 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 07:59:57 +0530 Subject: [PATCH 12/18] Revert "add tests for storing packages" This reverts commit ca2f49f505bd5c951b5f270d4b218a69848a6de9. --- Dockerfile | 7 + scancodeio/settings.py | 979 +++++++++++++++---------------- scanpipe/archiving.py | 375 ++++++------ scanpipe/pipelines/__init__.py | 699 +++++++++++----------- scanpipe/pipes/input.py | 692 +++++++++++----------- scanpipe/tests/test_archiving.py | 172 +++--- scanpipe/tests/test_input.py | 255 ++++---- scanpipe/tests/test_pipelines.py | 1 + 8 files changed, 1577 insertions(+), 1603 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2527dea2f3..9615d29f0c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,11 @@ <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> b6d23428 (Revert "Revert "add tests for storing packages"") +======= +>>>>>>> 507231a0 (Revert "add tests for storing packages") # SPDX-License-Identifier: Apache-2.0 # # http://nexb.com and https://github.com/aboutcode-org/scancode.io @@ -97,6 +100,7 @@ RUN pip install --no-cache-dir . # Copy the codebase and set the proper permissions for the APP_USER <<<<<<< HEAD +<<<<<<< HEAD COPY --chown=$APP_USER:$APP_USER . $APP_DIR ======= COPY --chown=$APP_USER:$APP_USER . $APP_DIR @@ -197,3 +201,6 @@ RUN pip install --no-cache-dir . # Copy the codebase and set the proper permissions for the APP_USER COPY --chown=$APP_USER:$APP_USER . $APP_DIR >>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") +======= +COPY --chown=$APP_USER:$APP_USER . $APP_DIR +>>>>>>> 507231a0 (Revert "add tests for storing packages") diff --git a/scancodeio/settings.py b/scancodeio/settings.py index 2d7686900c..15e52a4440 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -1,491 +1,488 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. 
-# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import sys -import tempfile -from pathlib import Path -import logging - -import environ - -from scanpipe.archiving import LocalFilesystemProvider - - -PROJECT_DIR = environ.Path(__file__) - 1 -ROOT_DIR = PROJECT_DIR - 1 - -# True if running tests through `./manage test` -IS_TESTS = "test" in sys.argv - -# Environment - -ENV_FILE = "/etc/scancodeio/.env" -if not Path(ENV_FILE).exists(): - ENV_FILE = ROOT_DIR(".env") - -# Do not use local .env environment when running the tests. -if IS_TESTS: - ENV_FILE = None - -env = environ.Env() -environ.Env.read_env(ENV_FILE) - -# Security - -SECRET_KEY = env.str("SECRET_KEY", default="") - -ALLOWED_HOSTS = env.list( - "ALLOWED_HOSTS", - default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], -) - -CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) - -# SECURITY WARNING: don't run with debug turned on in production -DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) - -SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( - "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False -) - -SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) - -SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) - -X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") - -SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) - -CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) - -# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT -# are handled by the web server. -SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] - -# ScanCode.io - -SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") - -SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") - -SCANCODEIO_CONFIG_FILE = env.str( - "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" -) - -SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") - -# Set the number of parallel processes to use for ScanCode related scan execution. -# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs -# available on the machine. -SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) - -SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") - -# This setting defines the additional locations ScanCode.io will search for pipelines. -# This should be set to a list of strings that contain full paths to your additional -# pipelines directories. -SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) - -# Maximum time allowed for a pipeline to complete. -SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") - -# Default to 2 minutes. -SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) - -# Default to None which scans all files -SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) - -# List views pagination, controls the number of items displayed per page. 
-# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 -SCANCODEIO_PAGINATE_BY = env.dict( - "SCANCODEIO_PAGINATE_BY", - default={ - "project": 20, - "error": 50, - "resource": 100, - "package": 100, - "dependency": 100, - "license": 100, - "relation": 100, - }, -) - -# Default limit for "most common" entries in QuerySets. -SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) - -# The base URL (e.g., https://hostname/) of this application instance. -# Required for generating URLs to reference objects within the app, -# such as in webhook notifications. -SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") - -# Fetch authentication credentials - -# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" -SCANCODEIO_FETCH_BASIC_AUTH = env.dict( - "SCANCODEIO_FETCH_BASIC_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" -SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( - "SCANCODEIO_FETCH_DIGEST_AUTH", - cast={"value": tuple}, - default={}, -) - -# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" -SCANCODEIO_FETCH_HEADERS = {} -FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") -for entry in FETCH_HEADERS_STR.split(";"): - if entry.strip(): - host, headers = entry.split("=", 1) - SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) - -# SCANCODEIO_NETRC_LOCATION="~/.netrc" -SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") -if SCANCODEIO_NETRC_LOCATION: - # Propagate the location to the environ for `requests.utils.get_netrc_auth` - env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION - -# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" -SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) - -# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" -SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( - "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" -) - -# This webhook will be added as WebhookSubscription for each new project. 
-# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False -SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) - -# Application definition - -INSTALLED_APPS = [ - # Local apps - # Must come before Third-party apps for proper templates override - "scanpipe", - # Django built-in - "django.contrib.auth", - "django.contrib.contenttypes", - "django.contrib.sessions", - "django.contrib.messages", - "django.contrib.staticfiles", - "django.contrib.admin", - "django.contrib.humanize", - # Third-party apps - "crispy_forms", - "crispy_bootstrap3", # required for the djangorestframework browsable API - "django_filters", - "rest_framework", - "rest_framework.authtoken", - "django_rq", - "django_probes", - "taggit", -] - -MIDDLEWARE = [ - "django.middleware.security.SecurityMiddleware", - "django.contrib.sessions.middleware.SessionMiddleware", - "django.middleware.common.CommonMiddleware", - "django.middleware.csrf.CsrfViewMiddleware", - "django.contrib.auth.middleware.AuthenticationMiddleware", - "django.contrib.messages.middleware.MessageMiddleware", - "django.middleware.clickjacking.XFrameOptionsMiddleware", - "scancodeio.middleware.TimezoneMiddleware", -] - -ROOT_URLCONF = "scancodeio.urls" - -WSGI_APPLICATION = "scancodeio.wsgi.application" - -SECURE_PROXY_SSL_HEADER = env.tuple( - "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") -) - -# Database - -DATABASES = { - "default": { - "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), - "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), - "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), - "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), - "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), - "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), - "ATOMIC_REQUESTS": True, - } -} - -DEFAULT_AUTO_FIELD = "django.db.models.AutoField" - -# Forms and filters - -FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") - -# Templates - -TEMPLATES = [ - { - "BACKEND": "django.template.backends.django.DjangoTemplates", - "APP_DIRS": True, - "OPTIONS": { - "debug": DEBUG, - "context_processors": [ - "django.contrib.auth.context_processors.auth", - "django.contrib.messages.context_processors.messages", - "django.template.context_processors.request", - "scancodeio.context_processors.versions", - ], - }, - }, -] - -# Login - -LOGIN_REDIRECT_URL = "project_list" - -# Passwords - -AUTH_PASSWORD_VALIDATORS = [ - { - "NAME": ( - "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" - ), - }, - { - "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", - "OPTIONS": { - "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), - }, - }, - { - "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", - }, -] - -# Testing - -if IS_TESTS: - from django.core.management.utils import get_random_secret_key - - SECRET_KEY = get_random_secret_key() - # Do not pollute the workspace while running the tests. - SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() - SCANCODEIO_REQUIRE_AUTHENTICATION = True - SCANCODEIO_SCAN_FILE_TIMEOUT = 120 - SCANCODEIO_POLICIES_FILE = None - # The default password hasher is rather slow by design. - # Using a faster hashing algorithm in the testing context to speed up the run. 
- PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] - -# Debug toolbar - -DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) -if DEBUG and DEBUG_TOOLBAR: - INSTALLED_APPS.append("debug_toolbar") - MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") - INTERNAL_IPS = ["127.0.0.1"] - -# Logging - -LOGGING = { - "version": 1, - "disable_existing_loggers": False, - "formatters": { - "simple": { - "format": "{levelname} {message}", - "style": "{", - }, - }, - "handlers": { - "null": { - "class": "logging.NullHandler", - }, - "console": { - "class": "logging.StreamHandler", - "formatter": "simple", - }, - }, - "loggers": { - "scanpipe": { - "handlers": ["null"] if IS_TESTS else ["console"], - "level": SCANCODEIO_LOG_LEVEL, - "propagate": False, - }, - "django": { - "handlers": ["null"] if IS_TESTS else ["console"], - "propagate": False, - }, - # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. - "django.db.backends": { - "level": SCANCODEIO_LOG_LEVEL, - }, - }, -} - -# Instead of sending out real emails the console backend just writes the emails -# that would be sent to the standard output. -EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" - -# Internationalization - -LANGUAGE_CODE = "en-us" - -FORMAT_MODULE_PATH = ["scancodeio.formats"] - -TIME_ZONE = env.str("TIME_ZONE", default="UTC") - -USE_I18N = True - -USE_TZ = True - -# Static files (CSS, JavaScript, Images) - -STATIC_URL = "/static/" - -STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") - -STATICFILES_DIRS = [ - PROJECT_DIR("static"), -] - -# Third-party apps - -CRISPY_TEMPLATE_PACK = "bootstrap3" - -# Centralized archive directory for all projects -CENTRAL_ARCHIVE_PATH = env.str( - "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" -) - -# localstorage configuration -DOWNLOAD_ARCHIVING_PROVIDER = env.str( - "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" -) - -# For local storage, we would store the root path in that setting -DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( - "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None -) - -# Initialize the DownloadStore for local storage - -download_store = None -logger = logging.getLogger(__name__) -if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": - config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} - root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) - try: - download_store = LocalFilesystemProvider(root_path=root_path) - except Exception as e: - logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") -else: - logger.error( - f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}" - ) - -# Job Queue - -RQ_QUEUES = { - "default": { - "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), - "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), - "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), - "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), - "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), - "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), - # Enable SSL for Redis connections when deploying ScanCode.io in environments - # where Redis is hosted on a separate system (e.g., cloud deployment or remote - # Redis server) to secure data in transit. 
- "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), - }, -} - -SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) -if not SCANCODEIO_ASYNC: - for queue_config in RQ_QUEUES.values(): - queue_config["ASYNC"] = False - -# ClamAV virus scan -CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) -CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") - -# Django restframework - -REST_FRAMEWORK = { - "DEFAULT_AUTHENTICATION_CLASSES": ( - "rest_framework.authentication.TokenAuthentication", - ), - "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), - "DEFAULT_RENDERER_CLASSES": ( - "rest_framework.renderers.JSONRenderer", - "rest_framework.renderers.BrowsableAPIRenderer", - "rest_framework.renderers.AdminRenderer", - ), - "DEFAULT_FILTER_BACKENDS": ( - "django_filters.rest_framework.DjangoFilterBackend", - "rest_framework.filters.SearchFilter", - ), - "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", - "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), - "UPLOADED_FILES_USE_URL": False, -} - -if not SCANCODEIO_REQUIRE_AUTHENTICATION: - REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( - "rest_framework.permissions.AllowAny", - ) - -# VulnerableCode integration - -VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") -VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") -VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") -VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") - -# PurlDB integration - -PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") -PURLDB_USER = env.str("PURLDB_USER", default="") -PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") -PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") - -# MatchCode.io integration - -MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") -MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") -MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") -MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") - -# FederatedCode integration - -FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( - "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" -).rstrip("/") -FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") -FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") -FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. 
Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import logging +import sys +import tempfile +from pathlib import Path + +import environ + +from scanpipe.archiving import LocalFilesystemProvider + +PROJECT_DIR = environ.Path(__file__) - 1 +ROOT_DIR = PROJECT_DIR - 1 + +# True if running tests through `./manage test` +IS_TESTS = "test" in sys.argv + +# Environment + +ENV_FILE = "/etc/scancodeio/.env" +if not Path(ENV_FILE).exists(): + ENV_FILE = ROOT_DIR(".env") + +# Do not use local .env environment when running the tests. +if IS_TESTS: + ENV_FILE = None + +env = environ.Env() +environ.Env.read_env(ENV_FILE) + +# Security + +SECRET_KEY = env.str("SECRET_KEY", default="") + +ALLOWED_HOSTS = env.list( + "ALLOWED_HOSTS", + default=[".localhost", "127.0.0.1", "[::1]", "host.docker.internal", "172.17.0.1"], +) + +CSRF_TRUSTED_ORIGINS = env.list("CSRF_TRUSTED_ORIGINS", default=[]) + +# SECURITY WARNING: don't run with debug turned on in production +DEBUG = env.bool("SCANCODEIO_DEBUG", default=False) + +SCANCODEIO_REQUIRE_AUTHENTICATION = env.bool( + "SCANCODEIO_REQUIRE_AUTHENTICATION", default=False +) + +SCANCODEIO_ENABLE_ADMIN_SITE = env.bool("SCANCODEIO_ENABLE_ADMIN_SITE", default=False) + +SECURE_CONTENT_TYPE_NOSNIFF = env.bool("SECURE_CONTENT_TYPE_NOSNIFF", default=True) + +X_FRAME_OPTIONS = env.str("X_FRAME_OPTIONS", default="DENY") + +SESSION_COOKIE_SECURE = env.bool("SESSION_COOKIE_SECURE", default=True) + +CSRF_COOKIE_SECURE = env.bool("CSRF_COOKIE_SECURE", default=True) + +# ``security.W004`` SECURE_HSTS_SECONDS and ``security.W008`` SECURE_SSL_REDIRECT +# are handled by the web server. +SILENCED_SYSTEM_CHECKS = ["security.W004", "security.W008"] + +# ScanCode.io + +SCANCODEIO_WORKSPACE_LOCATION = env.str("SCANCODEIO_WORKSPACE_LOCATION", default="var") + +SCANCODEIO_CONFIG_DIR = env.str("SCANCODEIO_CONFIG_DIR", default=".scancode") + +SCANCODEIO_CONFIG_FILE = env.str( + "SCANCODEIO_CONFIG_FILE", default="scancode-config.yml" +) + +SCANCODEIO_LOG_LEVEL = env.str("SCANCODEIO_LOG_LEVEL", "INFO") + +# Set the number of parallel processes to use for ScanCode related scan execution. +# If the SCANCODEIO_PROCESSES argument is not set, defaults to an optimal number of CPUs +# available on the machine. +SCANCODEIO_PROCESSES = env.int("SCANCODEIO_PROCESSES", default=None) + +SCANCODEIO_POLICIES_FILE = env.str("SCANCODEIO_POLICIES_FILE", default="policies.yml") + +# This setting defines the additional locations ScanCode.io will search for pipelines. +# This should be set to a list of strings that contain full paths to your additional +# pipelines directories. +SCANCODEIO_PIPELINES_DIRS = env.list("SCANCODEIO_PIPELINES_DIRS", default=[]) + +# Maximum time allowed for a pipeline to complete. +SCANCODEIO_TASK_TIMEOUT = env.str("SCANCODEIO_TASK_TIMEOUT", default="24h") + +# Default to 2 minutes. +SCANCODEIO_SCAN_FILE_TIMEOUT = env.int("SCANCODEIO_SCAN_FILE_TIMEOUT", default=120) + +# Default to None which scans all files +SCANCODEIO_SCAN_MAX_FILE_SIZE = env.int("SCANCODEIO_SCAN_MAX_FILE_SIZE", default=None) + +# List views pagination, controls the number of items displayed per page. 
+# Syntax in .env: SCANCODEIO_PAGINATE_BY=project=10,project_error=10 +SCANCODEIO_PAGINATE_BY = env.dict( + "SCANCODEIO_PAGINATE_BY", + default={ + "project": 20, + "error": 50, + "resource": 100, + "package": 100, + "dependency": 100, + "license": 100, + "relation": 100, + }, +) + +# Default limit for "most common" entries in QuerySets. +SCANCODEIO_MOST_COMMON_LIMIT = env.int("SCANCODEIO_MOST_COMMON_LIMIT", default=7) + +# The base URL (e.g., https://hostname/) of this application instance. +# Required for generating URLs to reference objects within the app, +# such as in webhook notifications. +SCANCODEIO_SITE_URL = env.str("SCANCODEIO_SITE_URL", default="") + +# Fetch authentication credentials + +# SCANCODEIO_FETCH_BASIC_AUTH="host=user,password;" +SCANCODEIO_FETCH_BASIC_AUTH = env.dict( + "SCANCODEIO_FETCH_BASIC_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_DIGEST_AUTH="host=user,password;" +SCANCODEIO_FETCH_DIGEST_AUTH = env.dict( + "SCANCODEIO_FETCH_DIGEST_AUTH", + cast={"value": tuple}, + default={}, +) + +# SCANCODEIO_FETCH_HEADERS="host=Header1=value,Header2=value;" +SCANCODEIO_FETCH_HEADERS = {} +FETCH_HEADERS_STR = env.str("SCANCODEIO_FETCH_HEADERS", default="") +for entry in FETCH_HEADERS_STR.split(";"): + if entry.strip(): + host, headers = entry.split("=", 1) + SCANCODEIO_FETCH_HEADERS[host] = env.parse_value(headers, cast=dict) + +# SCANCODEIO_NETRC_LOCATION="~/.netrc" +SCANCODEIO_NETRC_LOCATION = env.str("SCANCODEIO_NETRC_LOCATION", default="") +if SCANCODEIO_NETRC_LOCATION: + # Propagate the location to the environ for `requests.utils.get_netrc_auth` + env.ENVIRON["NETRC"] = SCANCODEIO_NETRC_LOCATION + +# SCANCODEIO_SKOPEO_CREDENTIALS="host1=user:password,host2=user:password" +SCANCODEIO_SKOPEO_CREDENTIALS = env.dict("SCANCODEIO_SKOPEO_CREDENTIALS", default={}) + +# SCANCODEIO_SKOPEO_AUTHFILE_LOCATION="/path/to/auth.json" +SCANCODEIO_SKOPEO_AUTHFILE_LOCATION = env.str( + "SCANCODEIO_SKOPEO_AUTHFILE_LOCATION", default="" +) + +# This webhook will be added as WebhookSubscription for each new project. 
+# SCANCODEIO_GLOBAL_WEBHOOK=target_url=https://webhook.url,trigger_on_each_run=False,include_summary=True,include_results=False +SCANCODEIO_GLOBAL_WEBHOOK = env.dict("SCANCODEIO_GLOBAL_WEBHOOK", default={}) + +# Application definition + +INSTALLED_APPS = [ + # Local apps + # Must come before Third-party apps for proper templates override + "scanpipe", + # Django built-in + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", + "django.contrib.humanize", + # Third-party apps + "crispy_forms", + "crispy_bootstrap3", # required for the djangorestframework browsable API + "django_filters", + "rest_framework", + "rest_framework.authtoken", + "django_rq", + "django_probes", + "taggit", +] + +MIDDLEWARE = [ + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", + "scancodeio.middleware.TimezoneMiddleware", +] + +ROOT_URLCONF = "scancodeio.urls" + +WSGI_APPLICATION = "scancodeio.wsgi.application" + +SECURE_PROXY_SSL_HEADER = env.tuple( + "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") +) + +# Database + +DATABASES = { + "default": { + "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), + "NAME": env.str("SCANCODEIO_DB_NAME", "scancodeio"), + "USER": env.str("SCANCODEIO_DB_USER", "scancodeio"), + "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "scancodeio"), + "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, + } +} + +DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + +# Forms and filters + +FILTERS_EMPTY_CHOICE_LABEL = env.str("FILTERS_EMPTY_CHOICE_LABEL", default="All") + +# Templates + +TEMPLATES = [ + { + "BACKEND": "django.template.backends.django.DjangoTemplates", + "APP_DIRS": True, + "OPTIONS": { + "debug": DEBUG, + "context_processors": [ + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "django.template.context_processors.request", + "scancodeio.context_processors.versions", + ], + }, + }, +] + +# Login + +LOGIN_REDIRECT_URL = "project_list" + +# Passwords + +AUTH_PASSWORD_VALIDATORS = [ + { + "NAME": ( + "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" + ), + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + "OPTIONS": { + "min_length": env.int("SCANCODEIO_PASSWORD_MIN_LENGTH", default=12), + }, + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, +] + +# Testing + +if IS_TESTS: + from django.core.management.utils import get_random_secret_key + + SECRET_KEY = get_random_secret_key() + # Do not pollute the workspace while running the tests. + SCANCODEIO_WORKSPACE_LOCATION = tempfile.mkdtemp() + SCANCODEIO_REQUIRE_AUTHENTICATION = True + SCANCODEIO_SCAN_FILE_TIMEOUT = 120 + SCANCODEIO_POLICIES_FILE = None + # The default password hasher is rather slow by design. + # Using a faster hashing algorithm in the testing context to speed up the run. 
+ PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"] + +# Debug toolbar + +DEBUG_TOOLBAR = env.bool("SCANCODEIO_DEBUG_TOOLBAR", default=False) +if DEBUG and DEBUG_TOOLBAR: + INSTALLED_APPS.append("debug_toolbar") + MIDDLEWARE.append("debug_toolbar.middleware.DebugToolbarMiddleware") + INTERNAL_IPS = ["127.0.0.1"] + +# Logging + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "simple": { + "format": "{levelname} {message}", + "style": "{", + }, + }, + "handlers": { + "null": { + "class": "logging.NullHandler", + }, + "console": { + "class": "logging.StreamHandler", + "formatter": "simple", + }, + }, + "loggers": { + "scanpipe": { + "handlers": ["null"] if IS_TESTS else ["console"], + "level": SCANCODEIO_LOG_LEVEL, + "propagate": False, + }, + "django": { + "handlers": ["null"] if IS_TESTS else ["console"], + "propagate": False, + }, + # Set SCANCODEIO_LOG_LEVEL=DEBUG to display all SQL queries in the console. + "django.db.backends": { + "level": SCANCODEIO_LOG_LEVEL, + }, + }, +} + +# Instead of sending out real emails the console backend just writes the emails +# that would be sent to the standard output. +EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" + +# Internationalization + +LANGUAGE_CODE = "en-us" + +FORMAT_MODULE_PATH = ["scancodeio.formats"] + +TIME_ZONE = env.str("TIME_ZONE", default="UTC") + +USE_I18N = True + +USE_TZ = True + +# Static files (CSS, JavaScript, Images) + +STATIC_URL = "/static/" + +STATIC_ROOT = env.str("STATIC_ROOT", default="/var/scancodeio/static/") + +STATICFILES_DIRS = [ + PROJECT_DIR("static"), +] + +# Third-party apps + +CRISPY_TEMPLATE_PACK = "bootstrap3" + +# Centralized archive directory for all projects +CENTRAL_ARCHIVE_PATH = env.str( + "CENTRAL_ARCHIVE_PATH", default="/var/scancodeio/archives" +) + +# localstorage configuration +DOWNLOAD_ARCHIVING_PROVIDER = env.str( + "DOWNLOAD_ARCHIVING_PROVIDER", default="localstorage" +) + +# For local storage, we would store the root path in that setting +DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION = env.dict( + "DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION", default=None +) + +# Initialize the DownloadStore for local storage + +download_store = None +logger = logging.getLogger(__name__) +if DOWNLOAD_ARCHIVING_PROVIDER == "localstorage": + config = DOWNLOAD_ARCHIVING_PROVIDER_CONFIGURATION or {} + root_path = Path(config.get("root_path", CENTRAL_ARCHIVE_PATH)) + try: + download_store = LocalFilesystemProvider(root_path=root_path) + except Exception as e: + logger.error(f"Failed to initialize LocalFilesystemProvider: {e}") +else: + logger.error(f"Unknown DOWNLOAD_ARCHIVING_PROVIDER: {DOWNLOAD_ARCHIVING_PROVIDER}") + +# Job Queue + +RQ_QUEUES = { + "default": { + "HOST": env.str("SCANCODEIO_RQ_REDIS_HOST", default="localhost"), + "PORT": env.str("SCANCODEIO_RQ_REDIS_PORT", default="6379"), + "DB": env.int("SCANCODEIO_RQ_REDIS_DB", default=0), + "USERNAME": env.str("SCANCODEIO_RQ_REDIS_USERNAME", default=None), + "PASSWORD": env.str("SCANCODEIO_RQ_REDIS_PASSWORD", default=""), + "DEFAULT_TIMEOUT": env.int("SCANCODEIO_RQ_REDIS_DEFAULT_TIMEOUT", default=360), + # Enable SSL for Redis connections when deploying ScanCode.io in environments + # where Redis is hosted on a separate system (e.g., cloud deployment or remote + # Redis server) to secure data in transit. 
+ "SSL": env.bool("SCANCODEIO_RQ_REDIS_SSL", default=False), + }, +} + +SCANCODEIO_ASYNC = env.bool("SCANCODEIO_ASYNC", default=False) +if not SCANCODEIO_ASYNC: + for queue_config in RQ_QUEUES.values(): + queue_config["ASYNC"] = False + +# ClamAV virus scan +CLAMD_USE_TCP = env.bool("CLAMD_USE_TCP", default=True) +CLAMD_TCP_ADDR = env.str("CLAMD_TCP_ADDR", default="clamav") + +# Django restframework + +REST_FRAMEWORK = { + "DEFAULT_AUTHENTICATION_CLASSES": ( + "rest_framework.authentication.TokenAuthentication", + ), + "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), + "DEFAULT_RENDERER_CLASSES": ( + "rest_framework.renderers.JSONRenderer", + "rest_framework.renderers.BrowsableAPIRenderer", + "rest_framework.renderers.AdminRenderer", + ), + "DEFAULT_FILTER_BACKENDS": ( + "django_filters.rest_framework.DjangoFilterBackend", + "rest_framework.filters.SearchFilter", + ), + "DEFAULT_PAGINATION_CLASS": "rest_framework.pagination.PageNumberPagination", + "PAGE_SIZE": env.int("SCANCODEIO_REST_API_PAGE_SIZE", default=50), + "UPLOADED_FILES_USE_URL": False, +} + +if not SCANCODEIO_REQUIRE_AUTHENTICATION: + REST_FRAMEWORK["DEFAULT_PERMISSION_CLASSES"] = ( + "rest_framework.permissions.AllowAny", + ) + +# VulnerableCode integration + +VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="").rstrip("/") +VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="") +VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="") +VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="") + +# PurlDB integration + +PURLDB_URL = env.str("PURLDB_URL", default="").rstrip("/") +PURLDB_USER = env.str("PURLDB_USER", default="") +PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="") +PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="") + +# MatchCode.io integration + +MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="").rstrip("/") +MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="") +MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="") +MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="") + +# FederatedCode integration + +FEDERATEDCODE_GIT_ACCOUNT_URL = env.str( + "FEDERATEDCODE_GIT_ACCOUNT_URL", default="" +).rstrip("/") +FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="") +FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="") +FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="") diff --git a/scanpipe/archiving.py b/scanpipe/archiving.py index 482f448de5..3f3d66e2e8 100644 --- a/scanpipe/archiving.py +++ b/scanpipe/archiving.py @@ -1,190 +1,185 @@ -# scanpipe/archiving.py -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
-# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import json -import logging -import os -import stat -from abc import ABC -from abc import abstractmethod -from dataclasses import dataclass -from pathlib import Path - - -logger = logging.getLogger(__name__) - - -@dataclass -class Download: - sha256: str - download_date: str - download_url: str - filename: str - - -class DownloadStore(ABC): - def _compute_sha256(self, content: bytes) -> str: - """Compute SHA256 hash for content.""" - return hashlib.sha256(content).hexdigest() - - def _compute_origin_hash( - self, filename: str, download_date: str, download_url: str - ) -> str: - """Compute a hash for the metadata to name the origin JSON file.""" - to_hash = f"{filename}{download_date}{download_url}".encode() - return hashlib.sha256(to_hash).hexdigest() - - def _build_metadata( - self, sha256: str, filename: str, download_date: str, download_url: str - ) -> dict: - """Build metadata dictionary for JSON storage.""" - return { - "sha256": sha256, - "filename": filename, - "download_date": download_date, - "download_url": download_url, - } - - @abstractmethod - def _get_content_path(self, sha256: str) -> str: - """Get the storage path/key for the content based on SHA256.""" - pass - - @abstractmethod - def list(self): - """Return an iterable of all stored downloads.""" - pass - - @abstractmethod - def get(self, sha256_checksum: str): - """Return a Download object for this checksum or None.""" - pass - - @abstractmethod - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """ - Store content with its metadata. Return a Download object on success. - Raise an exception on error. - """ - pass - - @abstractmethod - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Return a Download object matching the metadata or None.""" - pass - - -class LocalFilesystemProvider(DownloadStore): - def __init__(self, root_path: Path): - self.root_path = root_path - - def _get_content_path(self, sha256: str) -> Path: - """Create a nested path like 59/4c/67/... 
based on the SHA256 hash.""" - return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] - - def list(self): - """Return an iterable of all stored downloads.""" - downloads = [] - for content_path in self.root_path.rglob("content"): - origin_files = list(content_path.parent.glob("origin-*.json")) - for origin_file in origin_files: - try: - with open(origin_file) as f: - data = json.load(f) - downloads.append(Download(**data)) - except Exception as e: - logger.error(f"Error reading {origin_file}: {e}") - return downloads - - def get(self, sha256_checksum: str): - """Retrieve a Download object for the given SHA256 hash.""" - content_path = self._get_content_path(sha256_checksum) - if content_path.exists(): - origin_files = list(content_path.glob("origin-*.json")) - if origin_files: - try: - with open(origin_files[0]) as f: - data = json.load(f) - return Download(**data) - except Exception as e: - logger.error( - f"Error reading origin file for {sha256_checksum}: {e}" - ) - return None - - def put(self, content: bytes, download_url: str, download_date: str, filename: str): - """Store the content and its metadata.""" - sha256 = self._compute_sha256(content) - content_path = self._get_content_path(sha256) - content_path.mkdir(parents=True, exist_ok=True) - - content_file = content_path / "content" - if not content_file.exists(): - try: - with open(content_file, "wb") as f: - f.write(content) - except Exception as e: - raise Exception(f"Failed to write content to {content_file}: {e}") - - origin_hash = self._compute_origin_hash(filename, download_date, download_url) - origin_filename = f"origin-{origin_hash}.json" - origin_path = content_path / origin_filename - if origin_path.exists(): - raise Exception(f"Origin {origin_filename} already exists") - - metadata = self._build_metadata(sha256, filename, download_date, download_url) - try: - with open(origin_path, "w") as f: - json.dump(metadata, f, indent=2) - except Exception as e: - raise Exception(f"Failed to write metadata to {origin_path}: {e}") - - return Download(**metadata) - - def find( - self, download_url: str = None, filename: str = None, download_date: str = None - ): - """Find a download based on metadata.""" - if not (download_url or filename or download_date): - return None - for content_path in self.root_path.rglob("origin-*.json"): - try: - with open(content_path) as f: - data = json.load(f) - if ( - (download_url is None or data.get("url") == download_url) - and (filename is None or data.get("filename") == filename) - and ( - download_date is None - or data.get("download_date") == download_date - ) - ): - return Download(**data) - except Exception as e: - logger.error(f"Error reading {content_path}: {e}") - return None - - +# scanpipe/archiving.py +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import hashlib
+import json
+import logging
+from abc import ABC
+from abc import abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Download:
+    sha256: str
+    download_date: str
+    download_url: str
+    filename: str
+    # Filesystem location of the stored content, populated by providers that
+    # store content locally. Consumers such as archive_downloads() and
+    # add_input_from_url() read this attribute as ``download.path``.
+    path: str = ""
+
+
+class DownloadStore(ABC):
+    def _compute_sha256(self, content: bytes) -> str:
+        """Compute SHA256 hash for content."""
+        return hashlib.sha256(content).hexdigest()
+
+    def _compute_origin_hash(
+        self, filename: str, download_date: str, download_url: str
+    ) -> str:
+        """Compute a hash for the metadata to name the origin JSON file."""
+        to_hash = f"{filename}{download_date}{download_url}".encode()
+        return hashlib.sha256(to_hash).hexdigest()
+
+    def _build_metadata(
+        self, sha256: str, filename: str, download_date: str, download_url: str
+    ) -> dict:
+        """Build metadata dictionary for JSON storage."""
+        return {
+            "sha256": sha256,
+            "filename": filename,
+            "download_date": download_date,
+            "download_url": download_url,
+        }
+
+    @abstractmethod
+    def _get_content_path(self, sha256: str) -> str:
+        """Get the storage path/key for the content based on SHA256."""
+        pass
+
+    @abstractmethod
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        pass
+
+    @abstractmethod
+    def get(self, sha256_checksum: str):
+        """Return a Download object for this checksum or None."""
+        pass
+
+    @abstractmethod
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """
+        Store content with its metadata. Return a Download object on success.
+        Raise an exception on error.
+        """
+        pass
+
+    @abstractmethod
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Return a Download object matching the metadata or None."""
+        pass
+
+
+class LocalFilesystemProvider(DownloadStore):
+    def __init__(self, root_path: Path):
+        self.root_path = root_path
+
+    def _get_content_path(self, sha256: str) -> Path:
+        """Create a nested path like 59/4c/67/... 
based on the SHA256 hash."""
+        return self.root_path / sha256[:2] / sha256[2:4] / sha256[4:]
+
+    def list(self):
+        """Return an iterable of all stored downloads."""
+        downloads = []
+        for content_path in self.root_path.rglob("content"):
+            origin_files = list(content_path.parent.glob("origin-*.json"))
+            for origin_file in origin_files:
+                try:
+                    with open(origin_file) as f:
+                        data = json.load(f)
+                    downloads.append(Download(**data))
+                except Exception as e:
+                    logger.error(f"Error reading {origin_file}: {e}")
+        return downloads
+
+    def get(self, sha256_checksum: str):
+        """Retrieve a Download object for the given SHA256 hash."""
+        content_path = self._get_content_path(sha256_checksum)
+        if content_path.exists():
+            origin_files = list(content_path.glob("origin-*.json"))
+            if origin_files:
+                try:
+                    with open(origin_files[0]) as f:
+                        data = json.load(f)
+                    download = Download(**data)
+                    download.path = str(content_path / "content")
+                    return download
+                except Exception as e:
+                    logger.error(
+                        f"Error reading origin file for {sha256_checksum}: {e}"
+                    )
+        return None
+
+    def put(self, content: bytes, download_url: str, download_date: str, filename: str):
+        """Store the content and its metadata."""
+        sha256 = self._compute_sha256(content)
+        content_path = self._get_content_path(sha256)
+        content_path.mkdir(parents=True, exist_ok=True)
+
+        content_file = content_path / "content"
+        if not content_file.exists():
+            try:
+                with open(content_file, "wb") as f:
+                    f.write(content)
+            except Exception as e:
+                raise Exception(f"Failed to write content to {content_file}: {e}")
+
+        origin_hash = self._compute_origin_hash(filename, download_date, download_url)
+        origin_filename = f"origin-{origin_hash}.json"
+        origin_path = content_path / origin_filename
+        if origin_path.exists():
+            raise Exception(f"Origin {origin_filename} already exists")
+
+        metadata = self._build_metadata(sha256, filename, download_date, download_url)
+        try:
+            with open(origin_path, "w") as f:
+                json.dump(metadata, f, indent=2)
+        except Exception as e:
+            raise Exception(f"Failed to write metadata to {origin_path}: {e}")
+
+        download = Download(**metadata)
+        download.path = str(content_file)
+        return download
+
+    def find(
+        self, download_url: str = None, filename: str = None, download_date: str = None
+    ):
+        """Find a download based on metadata."""
+        if not (download_url or filename or download_date):
+            return None
+        for origin_path in self.root_path.rglob("origin-*.json"):
+            try:
+                with open(origin_path) as f:
+                    data = json.load(f)
+                if (
+                    # The origin JSON stores a "download_url" key, as written
+                    # by _build_metadata().
+                    (download_url is None or data.get("download_url") == download_url)
+                    and (filename is None or data.get("filename") == filename)
+                    and (
+                        download_date is None
+                        or data.get("download_date") == download_date
+                    )
+                ):
+                    return Download(**data)
+            except Exception as e:
+                logger.error(f"Error reading {origin_path}: {e}")
+        return None
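Editorial note: to make the provider contract above concrete, here is a minimal usage sketch. It is not part of the patch; the URL is a placeholder and only a writable temporary directory is assumed.

    import tempfile
    from pathlib import Path

    from scanpipe.archiving import LocalFilesystemProvider

    store = LocalFilesystemProvider(root_path=Path(tempfile.mkdtemp()))

    # Storing content returns a Download record; the bytes land at
    # <root>/<sha256[:2]>/<sha256[2:4]>/<sha256[4:]>/content
    download = store.put(
        content=b"example bytes",
        download_url="https://example.com/pkg.tar.gz",
        download_date="2025-08-21T09:00:00",
        filename="pkg.tar.gz",
    )

    # Lookup by checksum, or by the stored origin metadata.
    assert store.get(download.sha256).filename == "pkg.tar.gz"
    assert store.find(download_url="https://example.com/pkg.tar.gz") is not None

    # Identical content stored under a different origin reuses the same
    # content file; only a new origin-<hash>.json sidecar is written.
    duplicate = store.put(
        content=b"example bytes",
        download_url="https://example.com/mirror.tar.gz",
        download_date="2025-08-21T10:00:00",
        filename="mirror.tar.gz",
    )
    assert duplicate.sha256 == download.sha256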
diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py
index 1b6cd4e0a0..5153bf1887 100644
--- a/scanpipe/pipelines/__init__.py
+++ b/scanpipe/pipelines/__init__.py
@@ -1,346 +1,353 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# http://nexb.com and https://github.com/aboutcode-org/scancode.io
-# The ScanCode.io software is licensed under the Apache License version 2.0.
-# Data generated with ScanCode.io is provided as-is without warranties.
-# ScanCode is a trademark of nexB Inc.
-#
-# You may not use this software except in compliance with the License.
-# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software distributed
-# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-#
-# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
-# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
-# ScanCode.io should be considered or used as legal advice. Consult an Attorney
-# for any legal advice.
-#
-# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
-# Visit https://github.com/aboutcode-org/scancode.io for support and download.
-
-import inspect
-import logging
-import traceback
-import hashlib
-from contextlib import contextmanager
-from datetime import datetime
-from functools import wraps
-from pathlib import Path
-
-import bleach
-import requests
-from markdown_it import MarkdownIt
-from pyinstrument import Profiler
-
-from aboutcode.pipeline import BasePipeline
-from scancodeio.settings import download_store
-
-logger = logging.getLogger(__name__)
-
-
-class InputFilesError(Exception):
-    """InputFile is missing or cannot be downloaded."""
-
-    def __init__(self, error_tracebacks):
-        self.error_tracebacks = error_tracebacks
-        super().__init__(self._generate_message())
-
-    def _generate_message(self):
-        message = "InputFilesError encountered with the following issues:\n"
-        for index, (error, tb) in enumerate(self.error_tracebacks, start=1):
-            message += f"\nError {index}: {str(error)}\n\n{tb}"
-        return message
-
-
-def convert_markdown_to_html(markdown_text):
-    """Convert Markdown text to sanitized HTML."""
-    # Using the "js-default" for safety.
-    html_content = MarkdownIt("js-default").renderInline(markdown_text)
-    # Sanitize HTML using bleach.
-    sanitized_html = bleach.clean(html_content)
-    return sanitized_html
-
-
-class CommonStepsMixin:
-    """Common steps available on all project pipelines."""
-
-    def flag_empty_files(self):
-        """Flag empty files."""
-        from scanpipe.pipes import flag
-
-        flag.flag_empty_files(self.project)
-
-    def flag_ignored_resources(self):
-        """Flag ignored resources based on Project ``ignored_patterns`` setting."""
-        from scanpipe.pipes import flag
-
-        ignored_patterns = self.env.get("ignored_patterns", [])
-
-        if isinstance(ignored_patterns, str):
-            ignored_patterns = ignored_patterns.splitlines()
-        ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)
-
-        flag.flag_ignored_patterns(
-            codebaseresources=self.project.codebaseresources.no_status(),
-            patterns=ignored_patterns,
-        )
-
-    def extract_archive(self, location, target):
-        """Extract archive at `location` to `target`. 
Save errors as messages.""" - from scanpipe.pipes import scancode - - extract_errors = scancode.extract_archive(location, target) - - for resource_location, errors in extract_errors.items(): - resource_path = Path(resource_location) - - if resource_path.is_relative_to(self.project.codebase_path): - resource_path = resource_path.relative_to(self.project.codebase_path) - details = {"resource_path": str(resource_path)} - elif resource_path.is_relative_to(self.project.input_path): - resource_path = resource_path.relative_to(self.project.input_path) - details = {"path": f"input/{str(resource_path)}"} - else: - details = {"filename": str(resource_path.name)} - - self.project.add_error( - description="\n".join(errors), - model="extract_archive", - details=details, - ) - - def extract_archives(self, location=None): - """Extract archives located in the codebase/ directory with extractcode.""" - from scanpipe.pipes import scancode - - if not location: - location = self.project.codebase_path - - extract_errors = scancode.extract_archives(location=location, recurse=True) - - for resource_path, errors in extract_errors.items(): - self.project.add_error( - description="\n".join(errors), - model="extract_archives", - details={"resource_path": resource_path}, - ) - - # Reload the project env post-extraction as the scancode-config.yml file - # may be located in one of the extracted archives. - self.env = self.project.get_env() - - def download_missing_inputs(self): - """ - Download any InputSource missing on disk. - Raise an error if any of the uploaded files is not available or not reachable. - """ - error_tracebacks = [] - - for input_source in self.project.inputsources.all(): - if input_source.exists(): - continue - - if input_source.is_uploaded: - msg = f"Uploaded file {input_source} not available." - self.log(msg) - error_tracebacks.append((msg, "No traceback available.")) - continue - - download_url = input_source.download_url - if not download_url: - continue - - url_hash = hashlib.sha256(download_url.encode()).hexdigest() - filename = input_source.filename or Path(download_url).name or f"{url_hash}.archive" - archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if archive_path.exists(): - logger.info(f"Reusing existing archive at {archive_path}") - input_source.file_path = str(archive_path) - input_source.save() - continue - - self.log(f"Fetching input from {input_source.download_url}") - try: - input_source.fetch() - - except Exception as error: - traceback_str = traceback.format_exc() - logger.error(traceback_str) - self.log(f"{input_source.download_url} could not be fetched.") - error_tracebacks.append((str(error), traceback_str)) - - if error_tracebacks: - raise InputFilesError(error_tracebacks) - - def archive_downloads(self): - """ - Archive downloaded inputs to the centralized DownloadStore if not already - archived.Updates InputSource with archiving metadata (sha256, download_date). 
- """ - logger.info(f"Archiving downloads for project {self.project.name}") - for input_source in self.project.inputsources.filter( - sha256__isnull=True, is_uploaded=False - ): - if input_source.download_url: - try: - response = requests.get( - input_source.download_url, stream=True,timeout=30 - ) - response.raise_for_status() - content = response.content - filename = ( - input_source.filename - or input_source.download_url.split("/")[-1] - ) - download = download_store.put( - content=content, - download_url=input_source.download_url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - input_source.sha256 = download.sha256 - input_source.download_date = download.download_date - input_source.save() - except Exception as e: - self.add_error( - exception=e, - message=f"Failed to archive {input_source.download_url}", - ) - else: - logger.warning( - f"No download URL for input {input_source.filename}," - "skipping archiving" - ) - - -class ProjectPipeline(CommonStepsMixin, BasePipeline): - """Main class for all project related pipelines including common steps methods.""" - - # Flag specifying whether to download missing inputs as an initial step. - download_inputs = True - - # Optional URL that targets a view of the results relative to this Pipeline. - # This URL may contain dictionary-style string formatting, which will be - # interpolated against the project's field attributes. - # For example, you could use results_url="/project/{slug}/packages/?filter=value" - # to target the Package list view with an active filtering. - results_url = "" - - def __init__(self, run_instance): - """Load the Pipeline execution context from a Run database object.""" - self.run = run_instance - self.project = run_instance.project - self.env = self.project.get_env() - - self.pipeline_class = run_instance.pipeline_class - self.pipeline_name = run_instance.pipeline_name - - self.selected_groups = run_instance.selected_groups or [] - self.selected_steps = run_instance.selected_steps or [] - - self.ecosystem_config = None - - @classmethod - def get_initial_steps(cls): - """Add the ``download_inputs`` step as an initial step if enabled.""" - steps = [] - if cls.download_inputs: - steps.append(cls.download_missing_inputs) - if ENABLE_DOWNLOAD_ARCHIVING: - steps.append(cls.archive_downloads) - return tuple(steps) - - @classmethod - def get_info(cls, as_html=False): - """Add the option to render the values as HTML.""" - info = super().get_info() - - if as_html: - info["summary"] = convert_markdown_to_html(info["summary"]) - info["description"] = convert_markdown_to_html(info["description"]) - for step in info["steps"]: - step["doc"] = convert_markdown_to_html(step["doc"]) - - return info - - def append_to_log(self, message): - self.run.append_to_log(message) - - def set_current_step(self, message): - self.run.set_current_step(message) - - def add_error(self, exception, resource=None): - """Create a ``ProjectMessage`` ERROR record on the current `project`.""" - self.project.add_error( - model=self.pipeline_name, - exception=exception, - object_instance=resource, - ) - - @contextmanager - def save_errors(self, *exceptions, **kwargs): - """ - Context manager to save specified exceptions as ``ProjectMessage`` in the - database. 
- - - Example in a Pipeline step:: - - with self.save_errors(rootfs.DistroNotFound): - rootfs.scan_rootfs_for_system_packages(self.project, rfs) - - - Example when iterating over resources:: - - for resource in self.project.codebaseresources.all(): - with self.save_errors(Exception, resource=resource): - analyse(resource) - """ - try: - yield - except exceptions as error: - self.add_error(exception=error, **kwargs) - - -class Pipeline(ProjectPipeline): - """Alias for the ProjectPipeline class.""" - - pass - - -def is_pipeline(obj): - """ - Return True if the `obj` is a subclass of `Pipeline` except for the - `Pipeline` class itself. - """ - return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline - - -def profile(step): - """ - Profile a Pipeline step and save the results as HTML file in the project output - directory. - - Usage: - @profile - def step(self): - pass - """ - - @wraps(step) - def wrapper(*arg, **kwargs): - pipeline_instance = arg[0] - project = pipeline_instance.project - - with Profiler() as profiler: - result = step(*arg, **kwargs) - - output_file = project.get_output_file_path("profile", "html") - output_file.write_text(profiler.output_html()) - - pipeline_instance.log(f"Profiling results at {output_file.resolve()}") - - return result - - return wrapper +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
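Editorial note before the new module body: the interaction between ``download_inputs``, ``get_initial_steps()``, and the new ``archive_downloads`` step (all defined below) can be summarized with a short, hypothetical subclass sketch; the class name and step selection are illustrative only, not part of this patch.

    from scanpipe.pipelines import ProjectPipeline


    class InspectDownloads(ProjectPipeline):
        """Hypothetical pipeline used only to illustrate step ordering."""

        download_inputs = True  # the default; set False to skip the input steps
        results_url = "/project/{slug}/packages/"  # interpolated against Project fields

        @classmethod
        def steps(cls):
            return (cls.extract_archives,)


    # With download_inputs enabled, get_initial_steps() prepends the input
    # handling, so the effective execution order should be:
    #   download_missing_inputs -> archive_downloads -> extract_archives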
+
+import hashlib
+import inspect
+import logging
+import traceback
+from contextlib import contextmanager
+from datetime import datetime
+from functools import wraps
+from pathlib import Path
+
+import bleach
+from markdown_it import MarkdownIt
+from pyinstrument import Profiler
+
+from aboutcode.pipeline import BasePipeline
+from scancodeio.settings import download_store
+from django.conf import settings
+
+logger = logging.getLogger(__name__)
+
+
+class InputFilesError(Exception):
+    """InputFile is missing or cannot be downloaded."""
+
+    def __init__(self, error_tracebacks):
+        self.error_tracebacks = error_tracebacks
+        super().__init__(self._generate_message())
+
+    def _generate_message(self):
+        message = "InputFilesError encountered with the following issues:\n"
+        for index, (error, tb) in enumerate(self.error_tracebacks, start=1):
+            message += f"\nError {index}: {str(error)}\n\n{tb}"
+        return message
+
+
+def convert_markdown_to_html(markdown_text):
+    """Convert Markdown text to sanitized HTML."""
+    # Using the "js-default" preset for safety.
+    html_content = MarkdownIt("js-default").renderInline(markdown_text)
+    # Sanitize HTML using bleach.
+    sanitized_html = bleach.clean(html_content)
+    return sanitized_html
+
+
+class CommonStepsMixin:
+    """Common steps available on all project pipelines."""
+
+    def flag_empty_files(self):
+        """Flag empty files."""
+        from scanpipe.pipes import flag
+
+        flag.flag_empty_files(self.project)
+
+    def flag_ignored_resources(self):
+        """Flag ignored resources based on Project ``ignored_patterns`` setting."""
+        from scanpipe.pipes import flag
+
+        ignored_patterns = self.env.get("ignored_patterns", [])
+
+        if isinstance(ignored_patterns, str):
+            ignored_patterns = ignored_patterns.splitlines()
+        ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS)
+
+        flag.flag_ignored_patterns(
+            codebaseresources=self.project.codebaseresources.no_status(),
+            patterns=ignored_patterns,
+        )
+
+    def extract_archive(self, location, target):
+        """Extract archive at `location` to `target`. Save errors as messages."""
+        from scanpipe.pipes import scancode
+
+        extract_errors = scancode.extract_archive(location, target)
+
+        for resource_location, errors in extract_errors.items():
+            resource_path = Path(resource_location)
+
+            if resource_path.is_relative_to(self.project.codebase_path):
+                resource_path = resource_path.relative_to(self.project.codebase_path)
+                details = {"resource_path": str(resource_path)}
+            elif resource_path.is_relative_to(self.project.input_path):
+                resource_path = resource_path.relative_to(self.project.input_path)
+                details = {"path": f"input/{str(resource_path)}"}
+            else:
+                details = {"filename": str(resource_path.name)}
+
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archive",
+                details=details,
+            )
+
+    def extract_archives(self, location=None):
+        """Extract archives located in the codebase/ directory with extractcode."""
+        from scanpipe.pipes import scancode
+
+        if not location:
+            location = self.project.codebase_path
+
+        extract_errors = scancode.extract_archives(location=location, recurse=True)
+
+        for resource_path, errors in extract_errors.items():
+            self.project.add_error(
+                description="\n".join(errors),
+                model="extract_archives",
+                details={"resource_path": resource_path},
+            )
+
+        # Reload the project env post-extraction as the scancode-config.yml file
+        # may be located in one of the extracted archives.
+        self.env = self.project.get_env()
+
+    def download_missing_inputs(self):
+        """
+        Download any InputSource missing on disk.
+        Raise an error if any of the uploaded files is not available or not reachable.
+        """
+        error_tracebacks = []
+
+        for input_source in self.project.inputsources.all():
+            if input_source.exists():
+                continue
+
+            if input_source.is_uploaded:
+                msg = f"Uploaded file {input_source} not available."
+                self.log(msg)
+                error_tracebacks.append((msg, "No traceback available."))
+                continue
+
+            download_url = input_source.download_url
+            if not download_url:
+                continue
+
+            url_hash = hashlib.sha256(download_url.encode()).hexdigest()
+            filename = (
+                input_source.filename
+                or Path(download_url).name
+                or f"{url_hash}.archive"
+            )
+            archive_path = Path(settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename
+
+            if archive_path.exists():
+                logger.info(f"Reusing existing archive at {archive_path}")
+                input_source.file_path = str(archive_path)
+                input_source.save()
+                continue
+
+            self.log(f"Fetching input from {input_source.download_url}")
+            try:
+                input_source.fetch()
+
+            except Exception as error:
+                traceback_str = traceback.format_exc()
+                logger.error(traceback_str)
+                self.log(f"{input_source.download_url} could not be fetched.")
+                error_tracebacks.append((str(error), traceback_str))
+
+        if error_tracebacks:
+            raise InputFilesError(error_tracebacks)
+
+    def archive_downloads(self):
+        """
+        Archive downloaded inputs to the centralized DownloadStore if not already
+        archived. Update InputSource with archiving metadata (sha256, download_date).
+        """
+        if download_store is None:
+            # Archiving is disabled or misconfigured; nothing to do.
+            logger.warning("Download archiving is not configured, skipping.")
+            return
+
+        logger.info(f"Archiving downloads for project {self.project.name}")
+        for input_source in self.project.inputsources.filter(
+            sha256__isnull=True, is_uploaded=False
+        ):
+            if not input_source.download_url:
+                logger.warning(
+                    f"No download URL for input {input_source.filename}, "
+                    "skipping archiving"
+                )
+                continue
+
+            if not input_source.file_path:
+                logger.warning(
+                    f"No file_path for input {input_source.download_url}, "
+                    "skipping archiving"
+                )
+                continue
+            try:
+                with open(input_source.file_path, "rb") as f:
+                    content = f.read()
+                filename = (
+                    input_source.filename or input_source.download_url.split("/")[-1]
+                )
+                download = download_store.put(
+                    content=content,
+                    download_url=input_source.download_url,
+                    download_date=datetime.now().isoformat(),
+                    filename=filename,
+                )
+                input_source.sha256 = download.sha256
+                input_source.download_date = download.download_date
+                input_source.file_path = str(download.path)
+                input_source.save()
+            except Exception as e:
+                logger.error(f"Failed to archive {input_source.download_url}: {e}")
+                self.add_error(exception=e)
+
+
+class ProjectPipeline(CommonStepsMixin, BasePipeline):
+    """Main class for all project related pipelines including common steps methods."""
+
+    # Flag specifying whether to download missing inputs as an initial step.
+    download_inputs = True
+
+    # Optional URL that targets a view of the results relative to this Pipeline.
+    # This URL may contain dictionary-style string formatting, which will be
+    # interpolated against the project's field attributes.
+    # For example, you could use results_url="/project/{slug}/packages/?filter=value"
+    # to target the Package list view with an active filtering.
+ results_url = "" + + def __init__(self, run_instance): + """Load the Pipeline execution context from a Run database object.""" + self.run = run_instance + self.project = run_instance.project + self.env = self.project.get_env() + + self.pipeline_class = run_instance.pipeline_class + self.pipeline_name = run_instance.pipeline_name + + self.selected_groups = run_instance.selected_groups or [] + self.selected_steps = run_instance.selected_steps or [] + + self.ecosystem_config = None + + @classmethod + def get_initial_steps(cls): + """Add the ``download_inputs`` step as an initial step if enabled.""" + steps = [] + if cls.download_inputs: + steps.append(cls.download_missing_inputs) + steps.append(cls.archive_downloads) + return tuple(steps) + + @classmethod + def get_info(cls, as_html=False): + """Add the option to render the values as HTML.""" + info = super().get_info() + + if as_html: + info["summary"] = convert_markdown_to_html(info["summary"]) + info["description"] = convert_markdown_to_html(info["description"]) + for step in info["steps"]: + step["doc"] = convert_markdown_to_html(step["doc"]) + + return info + + def append_to_log(self, message): + self.run.append_to_log(message) + + def set_current_step(self, message): + self.run.set_current_step(message) + + def add_error(self, exception, resource=None): + """Create a ``ProjectMessage`` ERROR record on the current `project`.""" + self.project.add_error( + model=self.pipeline_name, + exception=exception, + object_instance=resource, + ) + + @contextmanager + def save_errors(self, *exceptions, **kwargs): + """ + Context manager to save specified exceptions as ``ProjectMessage`` in the + database. + + - Example in a Pipeline step:: + + with self.save_errors(rootfs.DistroNotFound): + rootfs.scan_rootfs_for_system_packages(self.project, rfs) + + - Example when iterating over resources:: + + for resource in self.project.codebaseresources.all(): + with self.save_errors(Exception, resource=resource): + analyse(resource) + """ + try: + yield + except exceptions as error: + self.add_error(exception=error, **kwargs) + + +class Pipeline(ProjectPipeline): + """Alias for the ProjectPipeline class.""" + + pass + + +def is_pipeline(obj): + """ + Return True if the `obj` is a subclass of `Pipeline` except for the + `Pipeline` class itself. + """ + return inspect.isclass(obj) and issubclass(obj, Pipeline) and obj is not Pipeline + + +def profile(step): + """ + Profile a Pipeline step and save the results as HTML file in the project output + directory. + + Usage: + @profile + def step(self): + pass + """ + + @wraps(step) + def wrapper(*arg, **kwargs): + pipeline_instance = arg[0] + project = pipeline_instance.project + + with Profiler() as profiler: + result = step(*arg, **kwargs) + + output_file = project.get_output_file_path("profile", "html") + output_file.write_text(profiler.output_html()) + + pipeline_instance.log(f"Profiling results at {output_file.resolve()}") + + return result + + return wrapper diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 81ae91c21d..906a2ee3a1 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -1,347 +1,345 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. 
-# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -import hashlib -import logging -import os -import shutil -from datetime import datetime -from pathlib import Path - -from django.core.exceptions import FieldDoesNotExist -from django.core.validators import EMPTY_VALUES -from django.db import models - -import openpyxl -import requests -from typecode.contenttype import get_type - -from scanpipe import pipes -from scanpipe.models import CodebaseRelation -from scanpipe.models import CodebaseResource -from scanpipe.models import DiscoveredDependency -from scanpipe.models import DiscoveredLicense -from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource -from scanpipe.pipes import scancode -from scanpipe.pipes.output import mappings_key_by_fieldname -from scancodeio.settings import download_store - -logger = logging.getLogger(__name__) - - -def copy_input(input_location, dest_path): - """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" - input_path = Path(input_location) - destination_dir = Path(dest_path) - destination = destination_dir / input_path.name - - if input_path.is_dir(): - shutil.copytree(input_location, destination) - else: - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - shutil.copyfile(input_location, destination) - - return destination - - -def copy_inputs(input_locations, dest_path): - """Copy the provided ``input_locations`` to the ``dest_path``.""" - for input_location in input_locations: - copy_input(input_location, dest_path) - - -def move_input(input_location, dest_path): - """Move the provided ``input_location`` to the ``dest_path``.""" - destination = dest_path / Path(input_location).name - return shutil.move(input_location, destination) - - -def move_inputs(inputs, dest_path): - """Move the provided ``inputs`` to the ``dest_path``.""" - for input_location in inputs: - move_input(input_location, dest_path) - - -def get_tool_name_from_scan_headers(scan_data): - """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - tool_name = first_header.get("tool_name", "") - return tool_name - - -def get_extra_data_from_scan_headers(scan_data): - """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" - if headers := scan_data.get("headers", []): - first_header = headers[0] - if extra_data := first_header.get("extra_data"): - return extra_data - - -def is_archive(location): - """Return True if the file at ``location`` is an archive.""" - return get_type(location).is_archive - - -def 
load_inventory_from_toolkit_scan(project, input_location): - """ - Create license detections, packages, dependencies, and resources - loaded from the ScanCode-toolkit scan results located at ``input_location``. - """ - scanned_codebase = scancode.get_virtual_codebase(project, input_location) - scancode.create_discovered_licenses(project, scanned_codebase) - scancode.create_discovered_packages(project, scanned_codebase) - scancode.create_codebase_resources(project, scanned_codebase) - scancode.create_discovered_dependencies( - project, scanned_codebase, strip_datafile_path_root=True - ) - scancode.load_todo_issues(project, scanned_codebase) - - -def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): - """ - Create packages, dependencies, license detections, resources, and relations - loaded from a ScanCode.io JSON output provided as ``scan_data``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - for detection_data in scan_data.get("license_detections", []): - pipes.update_or_create_license_detection(project, detection_data) - - for package_data in scan_data.get("packages", []): - pipes.update_or_create_package(project, package_data) - - for resource_data in scan_data.get("files", []): - pipes.update_or_create_resource(project, resource_data) - - for dependency_data in scan_data.get("dependencies", []): - pipes.update_or_create_dependency(project, dependency_data) - - for relation_data in scan_data.get("relations", []): - pipes.get_or_create_relation(project, relation_data) - - if extra_data := get_extra_data_from_scan_headers(scan_data): - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -model_to_object_maker_func = { - DiscoveredPackage: pipes.update_or_create_package, - DiscoveredDependency: pipes.update_or_create_dependency, - DiscoveredLicense: pipes.update_or_create_license_detection, - CodebaseResource: pipes.update_or_create_resource, - CodebaseRelation: pipes.get_or_create_relation, -} - -worksheet_name_to_model = { - "PACKAGES": DiscoveredPackage, - "LICENSE_DETECTIONS": DiscoveredLicense, - "RESOURCES": CodebaseResource, - "DEPENDENCIES": DiscoveredDependency, - "RELATIONS": CodebaseRelation, -} - - -def get_worksheet_data(worksheet): - """Return the data from provided ``worksheet`` as a list of dict.""" - try: - header = [cell.value for cell in next(worksheet.rows)] - except StopIteration: - return {} - - worksheet_data = [ - dict(zip(header, row)) - for row in worksheet.iter_rows(min_row=2, values_only=True) - ] - return worksheet_data - - -def clean_xlsx_field_value(model_class, field_name, value): - """Clean the ``value`` for compatibility with the database ``model_class``.""" - if value in EMPTY_VALUES: - return - - if field_name == "for_packages": - return value.splitlines() - - elif field_name in ["purl", "for_package_uid", "datafile_path"]: - return value - - try: - field = model_class._meta.get_field(field_name) - except FieldDoesNotExist: - return - - if dict_key := mappings_key_by_fieldname.get(field_name): - return [{dict_key: entry} for entry in value.splitlines()] - - elif isinstance(field, models.JSONField): - if field.default is list: - return value.splitlines() - elif field.default is dict: - return # dict stored as JSON are not supported - - return value - - -def clean_xlsx_data_to_model_data(model_class, xlsx_data): - """Clean the ``xlsx_data`` for 
compatibility with the database ``model_class``.""" - cleaned_data = {} - - for field_name, value in xlsx_data.items(): - if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): - cleaned_data[field_name] = cleaned_value - - return cleaned_data - - -def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): - """ - Create packages, dependencies, resources, and relations loaded from XLSX file - located at ``input_location``. - - An ``extra_data_prefix`` can be provided in case multiple input files are loaded - into the same project. The prefix is usually the filename of the input. - """ - workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) - - for worksheet_name, model_class in worksheet_name_to_model.items(): - if worksheet_name not in workbook: - continue - - worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) - for row_data in worksheet_data: - object_maker_func = model_to_object_maker_func.get(model_class) - cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) - if cleaned_data: - object_maker_func(project, cleaned_data) - - if "LAYERS" in workbook: - layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) - extra_data = {"layers": layers_data} - if extra_data_prefix: - extra_data = {extra_data_prefix: extra_data} - project.update_extra_data(extra_data) - - -def add_input_from_url(project, url, filename=None): - """ - Download the file from the provided ``url`` and add it as an InputSource for the - specified ``project``. Optionally, specify a ``filename`` for the downloaded file. - If archiving is enabled, store the content in the DownloadStore and save metadata. - """ - try: - response = requests.get(url, stream=True,timeout=30) - response.raise_for_status() - content = response.content - except requests.RequestException as e: - logger.error(f"Failed to download {url}: {e}") - raise - - filename = filename or url.split("/")[-1] or "downloaded_file" - url_hash = hashlib.sha256(url.encode()).hexdigest() - archive_path = Path(project.settings.CENTRAL_ARCHIVE_PATH) / url_hash / filename - - if download_store: - try: - download = download_store.put( - content=content, - download_url=url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to archive download for {url}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - download_url=url, - file_path=str(input_path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise - -def add_input_from_upload(project, uploaded_file): - """ - Add an uploaded file as an InputSource for the specified ``project``. - If archiving is enabled, store the content in the DownloadStore and save metadata. 
- """ - content = uploaded_file.read() - filename = uploaded_file.name - - if download_store: - try: - download = download_store.put( - content=content, - download_url="", - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to archive upload {filename}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - file_path=str(input_path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+ +import logging +import os +import shutil +from datetime import datetime +from pathlib import Path + +from django.core.exceptions import FieldDoesNotExist +from django.core.validators import EMPTY_VALUES +from django.db import models + +import openpyxl +import requests +from typecode.contenttype import get_type + +from scancodeio.settings import download_store +from scanpipe import pipes +from scanpipe.models import CodebaseRelation +from scanpipe.models import CodebaseResource +from scanpipe.models import DiscoveredDependency +from scanpipe.models import DiscoveredLicense +from scanpipe.models import DiscoveredPackage +from scanpipe.models import InputSource +from scanpipe.pipes import scancode +from scanpipe.pipes.output import mappings_key_by_fieldname + +logger = logging.getLogger(__name__) + + +def copy_input(input_location, dest_path): + """Copy the ``input_location`` (file or directory) to the ``dest_path``.""" + input_path = Path(input_location) + destination_dir = Path(dest_path) + destination = destination_dir / input_path.name + + if input_path.is_dir(): + shutil.copytree(input_location, destination) + else: + if not os.path.exists(destination_dir): + os.makedirs(destination_dir) + shutil.copyfile(input_location, destination) + + return destination + + +def copy_inputs(input_locations, dest_path): + """Copy the provided ``input_locations`` to the ``dest_path``.""" + for input_location in input_locations: + copy_input(input_location, dest_path) + + +def move_input(input_location, dest_path): + """Move the provided ``input_location`` to the ``dest_path``.""" + destination = dest_path / Path(input_location).name + return shutil.move(input_location, destination) + + +def move_inputs(inputs, dest_path): + """Move the provided ``inputs`` to the ``dest_path``.""" + for input_location in inputs: + move_input(input_location, dest_path) + + +def get_tool_name_from_scan_headers(scan_data): + """Return the ``tool_name`` of the first header in the provided ``scan_data``.""" + if headers := scan_data.get("headers", []): + first_header = headers[0] + tool_name = first_header.get("tool_name", "") + return tool_name + + +def get_extra_data_from_scan_headers(scan_data): + """Return the ``extra_data`` of the first header in the provided ``scan_data``.""" + if headers := scan_data.get("headers", []): + first_header = headers[0] + if extra_data := first_header.get("extra_data"): + return extra_data + + +def is_archive(location): + """Return True if the file at ``location`` is an archive.""" + return get_type(location).is_archive + + +def load_inventory_from_toolkit_scan(project, input_location): + """ + Create license detections, packages, dependencies, and resources + loaded from the ScanCode-toolkit scan results located at ``input_location``. + """ + scanned_codebase = scancode.get_virtual_codebase(project, input_location) + scancode.create_discovered_licenses(project, scanned_codebase) + scancode.create_discovered_packages(project, scanned_codebase) + scancode.create_codebase_resources(project, scanned_codebase) + scancode.create_discovered_dependencies( + project, scanned_codebase, strip_datafile_path_root=True + ) + scancode.load_todo_issues(project, scanned_codebase) + + +def load_inventory_from_scanpipe(project, scan_data, extra_data_prefix=None): + """ + Create packages, dependencies, license detections, resources, and relations + loaded from a ScanCode.io JSON output provided as ``scan_data``. 
+ + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. + """ + for detection_data in scan_data.get("license_detections", []): + pipes.update_or_create_license_detection(project, detection_data) + + for package_data in scan_data.get("packages", []): + pipes.update_or_create_package(project, package_data) + + for resource_data in scan_data.get("files", []): + pipes.update_or_create_resource(project, resource_data) + + for dependency_data in scan_data.get("dependencies", []): + pipes.update_or_create_dependency(project, dependency_data) + + for relation_data in scan_data.get("relations", []): + pipes.get_or_create_relation(project, relation_data) + + if extra_data := get_extra_data_from_scan_headers(scan_data): + if extra_data_prefix: + extra_data = {extra_data_prefix: extra_data} + project.update_extra_data(extra_data) + + +model_to_object_maker_func = { + DiscoveredPackage: pipes.update_or_create_package, + DiscoveredDependency: pipes.update_or_create_dependency, + DiscoveredLicense: pipes.update_or_create_license_detection, + CodebaseResource: pipes.update_or_create_resource, + CodebaseRelation: pipes.get_or_create_relation, +} + +worksheet_name_to_model = { + "PACKAGES": DiscoveredPackage, + "LICENSE_DETECTIONS": DiscoveredLicense, + "RESOURCES": CodebaseResource, + "DEPENDENCIES": DiscoveredDependency, + "RELATIONS": CodebaseRelation, +} + + +def get_worksheet_data(worksheet): + """Return the data from provided ``worksheet`` as a list of dict.""" + try: + header = [cell.value for cell in next(worksheet.rows)] + except StopIteration: + return {} + + worksheet_data = [ + dict(zip(header, row)) + for row in worksheet.iter_rows(min_row=2, values_only=True) + ] + return worksheet_data + + +def clean_xlsx_field_value(model_class, field_name, value): + """Clean the ``value`` for compatibility with the database ``model_class``.""" + if value in EMPTY_VALUES: + return + + if field_name == "for_packages": + return value.splitlines() + + elif field_name in ["purl", "for_package_uid", "datafile_path"]: + return value + + try: + field = model_class._meta.get_field(field_name) + except FieldDoesNotExist: + return + + if dict_key := mappings_key_by_fieldname.get(field_name): + return [{dict_key: entry} for entry in value.splitlines()] + + elif isinstance(field, models.JSONField): + if field.default is list: + return value.splitlines() + elif field.default is dict: + return # dict stored as JSON are not supported + + return value + + +def clean_xlsx_data_to_model_data(model_class, xlsx_data): + """Clean the ``xlsx_data`` for compatibility with the database ``model_class``.""" + cleaned_data = {} + + for field_name, value in xlsx_data.items(): + if cleaned_value := clean_xlsx_field_value(model_class, field_name, value): + cleaned_data[field_name] = cleaned_value + + return cleaned_data + + +def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): + """ + Create packages, dependencies, resources, and relations loaded from XLSX file + located at ``input_location``. + + An ``extra_data_prefix`` can be provided in case multiple input files are loaded + into the same project. The prefix is usually the filename of the input. 
+ """ + workbook = openpyxl.load_workbook(input_location, read_only=True, data_only=True) + + for worksheet_name, model_class in worksheet_name_to_model.items(): + if worksheet_name not in workbook: + continue + + worksheet_data = get_worksheet_data(worksheet=workbook[worksheet_name]) + for row_data in worksheet_data: + object_maker_func = model_to_object_maker_func.get(model_class) + cleaned_data = clean_xlsx_data_to_model_data(model_class, row_data) + if cleaned_data: + object_maker_func(project, cleaned_data) + + if "LAYERS" in workbook: + layers_data = get_worksheet_data(worksheet=workbook["LAYERS"]) + extra_data = {"layers": layers_data} + if extra_data_prefix: + extra_data = {extra_data_prefix: extra_data} + project.update_extra_data(extra_data) + + +def add_input_from_url(project, url, filename=None): + """ + Download the file from the provided ``url`` and add it as an InputSource for the + specified ``project``. Optionally, specify a ``filename`` for the downloaded file. + If archiving is enabled, store the content in the DownloadStore and save metadata. + """ + try: + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + content = response.content + except requests.RequestException as e: + logger.error(f"Failed to download {url}: {e}") + raise + + filename = filename or url.split("/")[-1] or "downloaded_file" + + if download_store: + try: + download = download_store.put( + content=content, + download_url=url, + download_date=datetime.now().isoformat(), + filename=filename, + ) + InputSource.objects.create( + project=project, + sha256=download.sha256, + download_url=download.download_url, + filename=download.filename, + download_date=download.download_date, + file_path=str(download.path), + is_uploaded=False, + ) + except Exception as e: + logger.error(f"Failed to archive download for {url}: {e}") + raise + else: + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + download_url=url, + file_path=str(input_path), + is_uploaded=False, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise + + +def add_input_from_upload(project, uploaded_file): + """ + Add an uploaded file as an InputSource for the specified ``project``. + If archiving is enabled, store the content in the DownloadStore and save metadata. 
+ """ + content = uploaded_file.read() + filename = uploaded_file.name + + if download_store: + try: + download = download_store.put( + content=content, + download_url="", + download_date=datetime.now().isoformat(), + filename=filename, + ) + InputSource.objects.create( + project=project, + sha256=download.sha256, + download_url=download.download_url, + filename=download.filename, + download_date=download.download_date, + file_path=str(download.path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to archive upload {filename}: {e}") + raise + else: + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + file_path=str(input_path), + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise diff --git a/scanpipe/tests/test_archiving.py b/scanpipe/tests/test_archiving.py index a249c96c46..0da1a236b5 100644 --- a/scanpipe/tests/test_archiving.py +++ b/scanpipe/tests/test_archiving.py @@ -1,86 +1,86 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
- - -import hashlib -from pathlib import Path - -from django.test import TestCase - -from scanpipe.archiving import LocalFilesystemProvider -from scanpipe.tests import make_project - - -class TestArchiving(TestCase): - def setUp(self): - self.project = make_project() - self.root_path = Path(__file__).parent / "data" / "test_downloads" - self.store = LocalFilesystemProvider(root_path=self.root_path) - self.test_content = b"test content" - self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - self.test_filename = "sample.tar.gz" - - def tearDown(self): - if self.root_path.exists(): - import shutil - - shutil.rmtree(self.root_path) - - def test_local_filesystem_provider_put_get(self): - download = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - sha256 = hashlib.sha256(self.test_content).hexdigest() - self.assertEqual(download.sha256, sha256) - self.assertEqual(download.download_url, self.test_url) - self.assertEqual(download.filename, self.test_filename) - self.assertEqual(download.download_date, "2025-08-21T09:00:00") - content_path = ( - self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" - ) - self.assertTrue(content_path.exists()) - with open(content_path, "rb") as f: - self.assertEqual(f.read(), self.test_content) - - retrieved = self.store.get(sha256) - self.assertEqual(retrieved.sha256, sha256) - self.assertEqual(retrieved.download_url, self.test_url) - self.assertEqual(retrieved.filename, self.test_filename) - - def test_local_filesystem_provider_deduplication(self): - download1 = self.store.put( - content=self.test_content, - download_url=self.test_url, - download_date="2025-08-21T09:00:00", - filename=self.test_filename, - ) - download2 = self.store.put( - content=self.test_content, - download_url="https://files.pythonhosted.org/packages/another.tar.gz", - download_date="2025-08-21T10:00:00", - filename="another.tar.gz", - ) - self.assertEqual(download1.sha256, download2.sha256) - self.assertEqual(download1.download_url, self.test_url) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+ + +import hashlib +from pathlib import Path + +from django.test import TestCase + +from scanpipe.archiving import LocalFilesystemProvider +from scanpipe.tests import make_project + + +class TestArchiving(TestCase): + def setUp(self): + self.project = make_project() + self.root_path = Path(__file__).parent / "data" / "test_downloads" + self.store = LocalFilesystemProvider(root_path=self.root_path) + self.test_content = b"test content" + self.test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" + self.test_filename = "sample.tar.gz" + + def tearDown(self): + if self.root_path.exists(): + import shutil + + shutil.rmtree(self.root_path) + + def test_local_filesystem_provider_put_get(self): + download = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + sha256 = hashlib.sha256(self.test_content).hexdigest() + self.assertEqual(download.sha256, sha256) + self.assertEqual(download.download_url, self.test_url) + self.assertEqual(download.filename, self.test_filename) + self.assertEqual(download.download_date, "2025-08-21T09:00:00") + content_path = ( + self.root_path / sha256[:2] / sha256[2:4] / sha256[4:] / "content" + ) + self.assertTrue(content_path.exists()) + with open(content_path, "rb") as f: + self.assertEqual(f.read(), self.test_content) + + retrieved = self.store.get(sha256) + self.assertEqual(retrieved.sha256, sha256) + self.assertEqual(retrieved.download_url, self.test_url) + self.assertEqual(retrieved.filename, self.test_filename) + + def test_local_filesystem_provider_deduplication(self): + download1 = self.store.put( + content=self.test_content, + download_url=self.test_url, + download_date="2025-08-21T09:00:00", + filename=self.test_filename, + ) + download2 = self.store.put( + content=self.test_content, + download_url="https://files.pythonhosted.org/packages/another.tar.gz", + download_date="2025-08-21T10:00:00", + filename="another.tar.gz", + ) + self.assertEqual(download1.sha256, download2.sha256) + self.assertEqual(download1.download_url, self.test_url) diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index 3f2848cf1b..e55a90cace 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -1,143 +1,112 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: -# http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, -# software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an -# "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. 
-# Visit https://github.com/aboutcode-org/scancode.io for support and download. - - -from pathlib import Path -from unittest.mock import patch - -from django.core.files.uploadedfile import SimpleUploadedFile -from django.test import TestCase - -from scanpipe.models import InputSource -from scanpipe.pipes.input import add_input_from_upload -from scanpipe.pipes.input import add_input_from_url -from scancodeio.settings import settings -from scanpipe.tests import make_project - - -class TestInput(TestCase): - def setUp(self): - self.project = make_project() - self.test_filename = "sample.tar.gz" - self.test_data_path = ( - Path(__file__).parent / - "data" / - "test-downloads" / - self.test_filename - ) - with open(self.test_data_path, "rb") as f: - self.test_content = f.read() - - @patch("requests.get") - def test_add_input_from_url(self, mock_get): - test_url = ( - "https://files.pythonhosted.org/" - "packages/sample.tar.gz" - ) - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url( - self.project, - test_url, - filename=self.test_filename - ) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith( - settings.CENTRAL_ARCHIVE_PATH - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - @patch("requests.get") - def test_add_input_from_url_fallback(self, mock_get): - test_url = ( - "https://files.pythonhosted.org/" - "packages/sample.tar.gz" - ) - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url( - self.project, - test_url, - filename=self.test_filename - ) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith( - str(self.project.input_path) - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - def test_add_input_from_upload(self): - uploaded_file = SimpleUploadedFile( - self.test_filename, - self.test_content - ) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertTrue(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith( - settings.CENTRAL_ARCHIVE_PATH - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - def test_add_input_from_upload_fallback(self): - uploaded_file = SimpleUploadedFile( - self.test_filename, - self.test_content - ) - add_input_from_upload(self.project, uploaded_file) - input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, "") - self.assertFalse(input_source.sha256) - 
self.assertFalse(input_source.download_date) - self.assertTrue(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith( - str(self.project.input_path) - ) - ) - self.assertTrue(Path(input_source.file_path).exists()) +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: +# http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, +# software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an +# "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + + +from pathlib import Path +from unittest.mock import patch + +from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase + +from scancodeio.settings import settings +from scanpipe.models import InputSource +from scanpipe.pipes.input import add_input_from_upload +from scanpipe.pipes.input import add_input_from_url +from scanpipe.tests import make_project + + +class TestInput(TestCase): + def setUp(self): + self.project = make_project() + self.test_filename = "sample.tar.gz" + self.test_data_path = ( + Path(__file__).parent / "data" / "test-downloads" / self.test_filename + ) + with open(self.test_data_path, "rb") as f: + self.test_content = f.read() + + @patch("requests.get") + def test_add_input_from_url(self, mock_get): + test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" + mock_get.return_value.content = self.test_content + mock_get.return_value.status_code = 200 + add_input_from_url(self.project, test_url, filename=self.test_filename) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, test_url) + self.assertTrue(input_source.sha256) + self.assertTrue(input_source.download_date) + self.assertFalse(input_source.is_uploaded) + self.assertTrue( + input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + @patch("scanpipe.pipes.input.download_store", None) + @patch("requests.get") + def test_add_input_from_url_fallback(self, mock_get): + test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" + mock_get.return_value.content = self.test_content + mock_get.return_value.status_code = 200 + add_input_from_url(self.project, test_url, filename=self.test_filename) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, test_url) + self.assertFalse(input_source.sha256) + 
self.assertFalse(input_source.download_date) + self.assertFalse(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith(str(self.project.input_path)) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + def test_add_input_from_upload(self): + uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertTrue(input_source.sha256) + self.assertTrue(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) + ) + self.assertTrue(Path(input_source.file_path).exists()) + + @patch("scanpipe.pipes.input.download_store", None) + def test_add_input_from_upload_fallback(self): + uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) + add_input_from_upload(self.project, uploaded_file) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.filename, self.test_filename) + self.assertEqual(input_source.download_url, "") + self.assertFalse(input_source.sha256) + self.assertFalse(input_source.download_date) + self.assertTrue(input_source.is_uploaded) + self.assertTrue( + str(input_source.file_path).startswith(str(self.project.input_path)) + ) + self.assertTrue(Path(input_source.file_path).exists()) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index edb1e4687e..722aaa33c5 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1,6 +1,7 @@ <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") From 86c0d233e7920311cf51a4b5efbb8601dd5eb628 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 09:16:46 +0530 Subject: [PATCH 13/18] fix CI errors Signed-off-by: Varsha U N --- scanpipe/pipelines/__init__.py | 3 +-- scanpipe/tests/test_pipelines.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py index 5153bf1887..ba4703d9d5 100644 --- a/scanpipe/pipelines/__init__.py +++ b/scanpipe/pipelines/__init__.py @@ -32,11 +32,10 @@ import bleach from markdown_it import MarkdownIt from pyinstrument import Profiler +from django.conf import settings from aboutcode.pipeline import BasePipeline from scancodeio.settings import download_store -from scancodeio.settings import settings - logger = logging.getLogger(__name__) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 722aaa33c5..ad71a8bab1 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -2,6 +2,7 @@ <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") From fbfbebbc4be0f92c308815c3680a22e43008603e Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 10:31:21 +0530 Subject: [PATCH 14/18] fix minor errors Signed-off-by: Varsha U N --- Dockerfile | 107 +------------------------------ scanpipe/pipelines/__init__.py | 4 +- scanpipe/tests/test_pipelines.py | 44 ++----------- 3 files changed, 12 insertions(+), 143 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9615d29f0c..d87dd649ca 100644 --- a/Dockerfile +++ 
b/Dockerfile @@ -1,111 +1,11 @@ <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD - + ======= >>>>>>> b6d23428 (Revert "Revert "add tests for storing packages"") ======= >>>>>>> 507231a0 (Revert "add tests for storing packages") -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/aboutcode-org/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/aboutcode-org/scancode.io for support and download. - -FROM python:3.13-slim - -LABEL org.opencontainers.image.source="https://github.com/aboutcode-org/scancode.io" -LABEL org.opencontainers.image.description="ScanCode.io" -LABEL org.opencontainers.image.licenses="Apache-2.0" - -ENV APP_NAME scancodeio -ENV APP_USER app -ENV APP_DIR /opt/$APP_NAME -ENV VENV_LOCATION /opt/$APP_NAME/.venv - -# Force Python unbuffered stdout and stderr (they are flushed to terminal immediately) -ENV PYTHONUNBUFFERED 1 -# Do not write Python .pyc files -ENV PYTHONDONTWRITEBYTECODE 1 -# Add the app dir in the Python path for entry points availability -ENV PYTHONPATH $PYTHONPATH:$APP_DIR - -# OS requirements as per -# https://scancode-toolkit.readthedocs.io/en/latest/getting-started/install.html -# Also install universal-ctags and xgettext for symbol and string collection. 
-RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - bzip2 \ - xz-utils \ - zlib1g \ - libxml2-dev \ - libxslt1-dev \ - libgomp1 \ - libsqlite3-0 \ - libgcrypt20 \ - libpopt0 \ - libzstd1 \ - libgpgme11 \ - libdevmapper1.02.1 \ - libguestfs-tools \ - linux-image-amd64 \ - git \ - wait-for-it \ - universal-ctags \ - gettext \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Create the APP_USER group and user -RUN addgroup --system $APP_USER \ - && adduser --system --group --home=$APP_DIR $APP_USER \ - && chown $APP_USER:$APP_USER $APP_DIR - -# Create the /var/APP_NAME directory with proper permission for APP_USER -RUN mkdir -p /var/$APP_NAME \ - && chown $APP_USER:$APP_USER /var/$APP_NAME - -# Setup the work directory and the user as APP_USER for the remaining stages -WORKDIR $APP_DIR -USER $APP_USER - -# Create the virtualenv -RUN python -m venv $VENV_LOCATION -# Enable the virtualenv, similar effect as "source activate" -ENV PATH $VENV_LOCATION/bin:$PATH - -# Create static/ and workspace/ directories -RUN mkdir -p /var/$APP_NAME/static/ \ - && mkdir -p /var/$APP_NAME/workspace/ - -# Install the dependencies before the codebase COPY for proper Docker layer caching -COPY --chown=$APP_USER:$APP_USER pyproject.toml $APP_DIR/ -RUN pip install --no-cache-dir . - -# Copy the codebase and set the proper permissions for the APP_USER -<<<<<<< HEAD -<<<<<<< HEAD -COPY --chown=$APP_USER:$APP_USER . $APP_DIR -======= -COPY --chown=$APP_USER:$APP_USER . $APP_DIR ->>>>>>> b6d23428 (Revert "Revert "add tests for storing packages"") -======= # SPDX-License-Identifier: Apache-2.0 # # http://nexb.com and https://github.com/aboutcode-org/scancode.io @@ -200,7 +100,4 @@ RUN pip install --no-cache-dir . # Copy the codebase and set the proper permissions for the APP_USER COPY --chown=$APP_USER:$APP_USER . $APP_DIR ->>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") -======= -COPY --chown=$APP_USER:$APP_USER . 
$APP_DIR ->>>>>>> 507231a0 (Revert "add tests for storing packages") + diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py index ba4703d9d5..5489cfca72 100644 --- a/scanpipe/pipelines/__init__.py +++ b/scanpipe/pipelines/__init__.py @@ -29,13 +29,15 @@ from functools import wraps from pathlib import Path +from django.conf import settings + import bleach from markdown_it import MarkdownIt from pyinstrument import Profiler -from django.conf import settings from aboutcode.pipeline import BasePipeline from scancodeio.settings import download_store + logger = logging.getLogger(__name__) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index ad71a8bab1..e08176121a 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1,11 +1,3 @@ -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - -======= ->>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") # SPDX-License-Identifier: Apache-2.0 # # http://nexb.com and https://github.com/nexB/scancode.io @@ -37,10 +29,6 @@ from pathlib import Path from unittest import mock from unittest import skipIf -<<<<<<< HEAD -======= -from unittest.mock import patch ->>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") from django.conf import settings from django.test import TestCase @@ -311,49 +299,31 @@ def test_archive_downloads(self, mock_get): with open(test_data_path, "rb") as f: test_content = f.read() -<<<<<<< HEAD - input_source=InputSource.objects.create( -======= - InputSource.objects.create( ->>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") - project=project1, - filename=test_filename, - download_url=test_url, - is_uploaded=False, + input_source = InputSource.objects.create( + InputSource.objects.create( + project=project1, + filename=test_filename, + download_url=test_url, + is_uploaded=False, + ) ) -<<<<<<< HEAD - -======= - ->>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") mock_get.return_value.content = test_content mock_get.return_value.status_code = 200 pipeline.download_missing_inputs() input_source.refresh_from_db() -<<<<<<< HEAD self.assertTrue( input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) ) self.assertTrue(Path(input_source.file_path).exists()) -======= - self.assertTrue(input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH)) - self.assertTrue(Path(input_source.file_path).exists()) - - ->>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") pipeline.archive_downloads() input_source = InputSource.refresh_from_db() self.assertTrue(input_source.sha256) self.assertTrue(input_source.download_date) self.assertEqual(input_source.download_url, test_url) self.assertEqual(input_source.filename, test_filename) -<<<<<<< HEAD -======= - ->>>>>>> ca2f49f5 (Revert "Revert "Revert "add tests for storing packages""") project2 = make_project(name="project2") input_source2 = InputSource.objects.create( project=project2, From aefd0696925087eb704bb7a3ae9d7c82384aba76 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 10:37:22 +0530 Subject: [PATCH 15/18] fix minor error Signed-off-by: Varsha U N --- Dockerfile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index d87dd649ca..37b3e5f87e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,3 @@ -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - -======= ->>>>>>> b6d23428 (Revert "Revert "add tests for storing 
packages"") -======= ->>>>>>> 507231a0 (Revert "add tests for storing packages") # SPDX-License-Identifier: Apache-2.0 # # http://nexb.com and https://github.com/aboutcode-org/scancode.io From 8cceed79788a78371a2c3ab7a70081b85887a551 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 18:16:36 +0530 Subject: [PATCH 16/18] fix the imports Signed-off-by: Varsha U N --- scanpipe/pipelines/__init__.py | 46 -------------------------------- scanpipe/tests/test_input.py | 2 +- scanpipe/tests/test_pipelines.py | 4 +-- 3 files changed, 3 insertions(+), 49 deletions(-) diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py index 5489cfca72..7c1532803c 100644 --- a/scanpipe/pipelines/__init__.py +++ b/scanpipe/pipelines/__init__.py @@ -180,51 +180,6 @@ def download_missing_inputs(self): if error_tracebacks: raise InputFilesError(error_tracebacks) - def archive_downloads(self): - """ - Archive downloaded inputs to the centralized DownloadStore if not already - archived.Updates InputSource with archiving metadata (sha256, download_date). - """ - logger.info(f"Archiving downloads for project {self.project.name}") - for input_source in self.project.inputsources.filter( - sha256__isnull=True, is_uploaded=False - ): - if input_source.download_url: - logger.warning( - f"No download URL for input {input_source.filename}, " - "skipping archiving" - ) - continue - - if not input_source.file_path: - logger.warning( - f"No file_path for input {input_source.download_url}, " - "skipping archiving" - ) - continue - try: - with open(input_source.file_path, "rb") as f: - content = f.read() - filename = ( - input_source.filename or input_source.download_url.split("/")[-1] - ) - download = download_store.put( - content=content, - download_url=input_source.download_url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - input_source.sha256 = download.sha256 - input_source.download_date = download.download_date - input_source.file_path = str(download.path) - input_source.save() - except Exception as e: - self.add_error( - exception=e, - message=f"Failed to archive {input_source.download_url}", - ) - - class ProjectPipeline(CommonStepsMixin, BasePipeline): """Main class for all project related pipelines including common steps methods.""" @@ -258,7 +213,6 @@ def get_initial_steps(cls): steps = [] if cls.download_inputs: steps.append(cls.download_missing_inputs) - steps.append(cls.archive_downloads) return tuple(steps) @classmethod diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index e55a90cace..44cfaa4409 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -29,8 +29,8 @@ from django.core.files.uploadedfile import SimpleUploadedFile from django.test import TestCase +from django.conf import settings -from scancodeio.settings import settings from scanpipe.models import InputSource from scanpipe.pipes.input import add_input_from_upload from scanpipe.pipes.input import add_input_from_url diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index e08176121a..5d956e3703 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -185,7 +185,7 @@ def test_scanpipe_pipeline_class_download_inputs_attribute(self): run = project1.add_pipeline("download_inputs") pipeline = run.make_pipeline_instance() self.assertTrue(pipeline.download_inputs) - expected = (CommonStepsMixin.download_missing_inputs,) + expected = (CommonStepsMixin.download_missing_inputs) 
self.assertEqual(expected, pipeline.get_initial_steps()) expected = (CommonStepsMixin.download_missing_inputs, DownloadInput.step1) self.assertEqual(expected, pipeline.get_steps()) @@ -301,7 +301,7 @@ def test_archive_downloads(self, mock_get): input_source = InputSource.objects.create( InputSource.objects.create( - project=project1, + project1=project1, filename=test_filename, download_url=test_url, is_uploaded=False, From ede7730b7d8cb20383e45cc2e01a272fb49ff79b Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 19:48:39 +0530 Subject: [PATCH 17/18] fix CI errors and imports Signed-off-by: Varsha U N --- scanpipe/pipelines/__init__.py | 3 +- scanpipe/pipes/input.py | 101 +++++++++---------------------- scanpipe/tests/test_input.py | 64 +++++++------------- scanpipe/tests/test_pipelines.py | 54 +---------------- 4 files changed, 52 insertions(+), 170 deletions(-) diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py index 7c1532803c..f24ce0026b 100644 --- a/scanpipe/pipelines/__init__.py +++ b/scanpipe/pipelines/__init__.py @@ -25,7 +25,6 @@ import logging import traceback from contextlib import contextmanager -from datetime import datetime from functools import wraps from pathlib import Path @@ -36,7 +35,6 @@ from pyinstrument import Profiler from aboutcode.pipeline import BasePipeline -from scancodeio.settings import download_store logger = logging.getLogger(__name__) @@ -180,6 +178,7 @@ def download_missing_inputs(self): if error_tracebacks: raise InputFilesError(error_tracebacks) + class ProjectPipeline(CommonStepsMixin, BasePipeline): """Main class for all project related pipelines including common steps methods.""" diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 906a2ee3a1..a7f0edee9c 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -23,7 +23,6 @@ import logging import os import shutil -from datetime import datetime from pathlib import Path from django.core.exceptions import FieldDoesNotExist @@ -34,7 +33,6 @@ import requests from typecode.contenttype import get_type -from scancodeio.settings import download_store from scanpipe import pipes from scanpipe.models import CodebaseRelation from scanpipe.models import CodebaseResource @@ -261,43 +259,21 @@ def add_input_from_url(project, url, filename=None): raise filename = filename or url.split("/")[-1] or "downloaded_file" + input_path = project.input_path / filename - if download_store: - try: - download = download_store.put( - content=content, - download_url=url, - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to archive download for {url}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - download_url=url, - file_path=str(input_path), - is_uploaded=False, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + 
download_url=url, + is_uploaded=False, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise def add_input_from_upload(project, uploaded_file): @@ -307,39 +283,16 @@ def add_input_from_upload(project, uploaded_file): """ content = uploaded_file.read() filename = uploaded_file.name - - if download_store: - try: - download = download_store.put( - content=content, - download_url="", - download_date=datetime.now().isoformat(), - filename=filename, - ) - InputSource.objects.create( - project=project, - sha256=download.sha256, - download_url=download.download_url, - filename=download.filename, - download_date=download.download_date, - file_path=str(download.path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to archive upload {filename}: {e}") - raise - else: - input_path = project.input_path / filename - try: - input_path.parent.mkdir(parents=True, exist_ok=True) - with open(input_path, "wb") as f: - f.write(content) - InputSource.objects.create( - project=project, - filename=filename, - file_path=str(input_path), - is_uploaded=True, - ) - except Exception as e: - logger.error(f"Failed to save {filename} to {input_path}: {e}") - raise + input_path = project.input_path / filename + try: + input_path.parent.mkdir(parents=True, exist_ok=True) + with open(input_path, "wb") as f: + f.write(content) + InputSource.objects.create( + project=project, + filename=filename, + is_uploaded=True, + ) + except Exception as e: + logger.error(f"Failed to save {filename} to {input_path}: {e}") + raise diff --git a/scanpipe/tests/test_input.py b/scanpipe/tests/test_input.py index 44cfaa4409..539474a87c 100644 --- a/scanpipe/tests/test_input.py +++ b/scanpipe/tests/test_input.py @@ -25,11 +25,11 @@ from pathlib import Path +from unittest.mock import Mock from unittest.mock import patch from django.core.files.uploadedfile import SimpleUploadedFile from django.test import TestCase -from django.conf import settings from scanpipe.models import InputSource from scanpipe.pipes.input import add_input_from_upload @@ -49,38 +49,17 @@ def setUp(self): @patch("requests.get") def test_add_input_from_url(self, mock_get): - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 + test_url = "https://example.com/test.tar.gz" + mock_response = Mock() + mock_response.content = self.test_content + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response add_input_from_url(self.project, test_url, filename=self.test_filename) input_source = InputSource.objects.get(project=self.project) - self.assertEqual(input_source.filename, self.test_filename) self.assertEqual(input_source.download_url, test_url) - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertFalse(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - @patch("scanpipe.pipes.input.download_store", None) - @patch("requests.get") - def test_add_input_from_url_fallback(self, mock_get): - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - mock_get.return_value.content = self.test_content - mock_get.return_value.status_code = 200 - add_input_from_url(self.project, test_url, filename=self.test_filename) - input_source = InputSource.objects.get(project=self.project) 
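# The Mock above stands in for a full HTTP response: the test configures only
# .content and .raise_for_status(), suggesting those are the only response
# attributes the download path touches. A condensed sketch of that stubbing
# pattern, assuming nothing beyond the standard unittest.mock API and an
# importable requests package; the URL and payload are illustrative.
from unittest.mock import Mock
from unittest.mock import patch

import requests

mock_response = Mock()
mock_response.content = b"payload"
mock_response.raise_for_status.return_value = None  # behave like an HTTP 200

with patch("requests.get", return_value=mock_response):
    response = requests.get("https://example.com/test.tar.gz")
    response.raise_for_status()  # no-op instead of raising
    assert response.content == b"payload"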
self.assertEqual(input_source.filename, self.test_filename) - self.assertEqual(input_source.download_url, test_url) - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) self.assertFalse(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith(str(self.project.input_path)) - ) - self.assertTrue(Path(input_source.file_path).exists()) + self.assertTrue((self.project.input_path / self.test_filename).exists()) def test_add_input_from_upload(self): uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) @@ -88,25 +67,28 @@ def test_add_input_from_upload(self): input_source = InputSource.objects.get(project=self.project) self.assertEqual(input_source.filename, self.test_filename) self.assertEqual(input_source.download_url, "") - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) self.assertTrue(input_source.is_uploaded) - self.assertTrue( - input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) - ) - self.assertTrue(Path(input_source.file_path).exists()) + self.assertTrue((self.project.input_path / self.test_filename).exists()) + + @patch("requests.get") + def test_add_input_from_url_fallback(self, mock_get): + test_url = "https://example.com/test.tar.gz" + mock_response = Mock() + mock_response.content = self.test_content + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response + add_input_from_url(self.project, test_url, filename=self.test_filename) + input_source = InputSource.objects.get(project=self.project) + self.assertEqual(input_source.download_url, test_url) + self.assertEqual(input_source.filename, self.test_filename) + self.assertFalse(input_source.is_uploaded) + self.assertTrue((self.project.input_path / self.test_filename).exists()) - @patch("scanpipe.pipes.input.download_store", None) def test_add_input_from_upload_fallback(self): uploaded_file = SimpleUploadedFile(self.test_filename, self.test_content) add_input_from_upload(self.project, uploaded_file) input_source = InputSource.objects.get(project=self.project) self.assertEqual(input_source.filename, self.test_filename) self.assertEqual(input_source.download_url, "") - self.assertFalse(input_source.sha256) - self.assertFalse(input_source.download_date) self.assertTrue(input_source.is_uploaded) - self.assertTrue( - str(input_source.file_path).startswith(str(self.project.input_path)) - ) - self.assertTrue(Path(input_source.file_path).exists()) + self.assertTrue((self.project.input_path / self.test_filename).exists()) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 5d956e3703..0927351e60 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -41,7 +41,6 @@ from scanpipe import pipes from scanpipe.models import CodebaseResource from scanpipe.models import DiscoveredPackage -from scanpipe.models import InputSource from scanpipe.pipelines import CommonStepsMixin from scanpipe.pipelines import InputFilesError from scanpipe.pipelines import Pipeline @@ -185,7 +184,7 @@ def test_scanpipe_pipeline_class_download_inputs_attribute(self): run = project1.add_pipeline("download_inputs") pipeline = run.make_pipeline_instance() self.assertTrue(pipeline.download_inputs) - expected = (CommonStepsMixin.download_missing_inputs) + expected = (CommonStepsMixin.download_missing_inputs,) self.assertEqual(expected, pipeline.get_initial_steps()) expected = (CommonStepsMixin.download_missing_inputs, DownloadInput.step1) 
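# get_initial_steps() collects plain function objects into a tuple, which is
# why the restored assertions can compare its result directly against unbound
# methods such as CommonStepsMixin.download_missing_inputs. A stripped-down
# illustration of that registry pattern; these toy classes are stand-ins for
# the real pipeline classes, not their implementation.
class CommonSteps:
    def download_missing_inputs(self):
        print("downloading missing inputs")


class DemoPipeline(CommonSteps):
    download_inputs = True

    @classmethod
    def get_initial_steps(cls):
        steps = []
        if cls.download_inputs:
            steps.append(cls.download_missing_inputs)
        return tuple(steps)


# The tuple holds the very same function object defined on the mixin.
assert DemoPipeline.get_initial_steps() == (CommonSteps.download_missing_inputs,)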
self.assertEqual(expected, pipeline.get_steps()) @@ -287,57 +286,6 @@ def mock_make_to_path(**kwargs): self.assertTrue(input_source.exists()) @mock.patch("requests.get") - def test_archive_downloads(self, mock_get): - project1 = make_project() - run = project1.add_pipeline("scan_codebase") - pipeline = run.make_pipeline_instance() - test_filename = "sample.tar.gz" - test_url = "https://files.pythonhosted.org/packages/sample.tar.gz" - test_data_path = ( - Path(__file__).parent / "data" / "test-downloads" / test_filename - ) - with open(test_data_path, "rb") as f: - test_content = f.read() - - input_source = InputSource.objects.create( - InputSource.objects.create( - project1=project1, - filename=test_filename, - download_url=test_url, - is_uploaded=False, - ) - ) - mock_get.return_value.content = test_content - mock_get.return_value.status_code = 200 - - pipeline.download_missing_inputs() - input_source.refresh_from_db() - self.assertTrue( - input_source.file_path.startswith(settings.CENTRAL_ARCHIVE_PATH) - ) - self.assertTrue(Path(input_source.file_path).exists()) - - pipeline.archive_downloads() - input_source = InputSource.refresh_from_db() - self.assertTrue(input_source.sha256) - self.assertTrue(input_source.download_date) - self.assertEqual(input_source.download_url, test_url) - self.assertEqual(input_source.filename, test_filename) - - project2 = make_project(name="project2") - input_source2 = InputSource.objects.create( - project=project2, - filename=test_filename, - download_url=test_url, - is_uploaded=False, - ) - run2 = project2.add_pipeline("scan_codebase") - pipeline2 = run2.make_pipeline_instance() - pipeline2.download_missing_inputs() - input_source2.refresh_from_db() - self.assertEqual(input_source.file_path, input_source2.file_path) - self.assertTrue(Path(input_source2.file_path).exists()) - def test_scanpipe_pipeline_class_save_errors_context_manager(self): project1 = make_project() run = project1.add_pipeline("do_nothing") From 660a965ff6a34cac8a91c9132b6169b7ff62fe53 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Thu, 18 Sep 2025 20:08:23 +0530 Subject: [PATCH 18/18] fix ci errors Signed-off-by: Varsha U N --- scanpipe/tests/test_pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 0927351e60..03dd1ff1f2 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -286,7 +286,7 @@ def mock_make_to_path(**kwargs): self.assertTrue(input_source.exists()) @mock.patch("requests.get") - def test_scanpipe_pipeline_class_save_errors_context_manager(self): + def test_scanpipe_pipeline_class_save_errors_context_manager(self, *args, **kwargs): project1 = make_project() run = project1.add_pipeline("do_nothing") pipeline = run.make_pipeline_instance()
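The final hunk works because @mock.patch injects the replacement object as an
extra positional argument: after test_archive_downloads was deleted, the
leftover @mock.patch("requests.get") decorator still hands a MagicMock to
whichever test function sits beneath it, so calling the undecorated signature
raises a TypeError, and *args, **kwargs lets the surviving test absorb that
extra argument. A minimal sketch of the mechanic, assuming only standard
unittest.mock behavior and an importable requests package; the function name
is illustrative.

from unittest import mock


@mock.patch("requests.get")
def probe(*args, **kwargs):
    # mock.patch prepends the MagicMock that temporarily replaces
    # requests.get for the duration of the call.
    print(args)


probe()  # prints a one-element tuple holding the MagicMock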