Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions vulnerabilities/importers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2
from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2
from vulnerabilities.pipelines.v2_importers import postgresql_importer as postgresql_importer_v2
from vulnerabilities.pipelines.v2_importers import project_kb_importer as project_kb_importer_v2
from vulnerabilities.pipelines.v2_importers import pypa_importer as pypa_importer_v2
from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2
from vulnerabilities.pipelines.v2_importers import redhat_importer as redhat_importer_v2
Expand All @@ -81,6 +82,7 @@
mozilla_importer_v2.MozillaImporterPipeline,
github_osv_importer_v2.GithubOSVImporterPipeline,
redhat_importer_v2.RedHatImporterPipeline,
project_kb_importer_v2.ProjectKBPipeline,
nvd_importer.NVDImporterPipeline,
github_importer.GitHubAPIImporterPipeline,
gitlab_importer.GitLabImporterPipeline,
Expand Down
4 changes: 4 additions & 0 deletions vulnerabilities/improvers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from vulnerabilities.pipelines import flag_ghost_packages
from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline
from vulnerabilities.pipelines import remove_duplicate_advisories
from vulnerabilities.pipelines.v2_improvers import (
collect_commits_project_kb as collect_commits_project_kb_v2,
)
from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2
from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2
from vulnerabilities.pipelines.v2_improvers import (
Expand Down Expand Up @@ -68,5 +71,6 @@
compute_version_rank_v2.ComputeVersionRankPipeline,
compute_advisory_todo_v2.ComputeToDo,
compute_advisory_todo.ComputeToDo,
collect_commits_project_kb_v2.CollectFixCommitsProjectKBPipeline,
]
)
125 changes: 125 additions & 0 deletions vulnerabilities/pipelines/v2_importers/project_kb_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from pathlib import Path
from typing import Iterable

import saneyaml
from fetchcode.vcs import fetch_via_vcs
from packageurl import PackageURL
from univers.maven import VersionRange

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerabilities.utils import get_advisory_url


class ProjectKBPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    ProjectKB Importer Pipeline

    Collect advisories from ProjectKB data:
    - YAML statements: https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml

    The ``vulnerability-data`` branch is cloned, every ``statement.yaml`` found
    under ``statements/`` is converted to an ``AdvisoryData`` and the clone is
    removed afterwards.
    """

    pipeline_id = "project-kb_v2"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    repo_url = "git+https://github.com/SAP/project-kb@vulnerability-data"

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, import, then clean up."""
        return (cls.clone_repo, cls.collect_and_store_advisories, cls.clean_downloads)

    def clone_repo(self):
        """Clone the ProjectKB repository and keep the VCS response on ``self``."""
        self.log("Processing ProjectKB advisory data...")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self):
        """Return the number of ``statement.yaml`` files found under ``statements/``."""
        base_path = Path(self.vcs_response.dest_dir) / "statements"
        # Count only the files collect_advisories() actually processes.
        count = sum(
            1 for yaml_file in base_path.rglob("*.yaml") if yaml_file.name == "statement.yaml"
        )
        self.log(f"Estimated advisories to process: {count}")
        return count

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield one AdvisoryData per ``statement.yaml`` found under ``statements/``.

        Statements without a ``vulnerability_id`` are skipped. Notes are joined
        into the summary, fix commits become references and artifacts flagged
        as ``affected`` become affected packages.
        """
        base_path = Path(self.vcs_response.dest_dir) / "statements"

        for yaml_file in base_path.rglob("*.yaml"):
            if yaml_file.name != "statement.yaml":
                continue

            with open(yaml_file, encoding="utf-8") as f:
                yaml_data = saneyaml.load(f)

            # An empty or non-mapping document has nothing to import.
            if not isinstance(yaml_data, dict):
                continue

            vulnerability_id = yaml_data.get("vulnerability_id")
            if not vulnerability_id:
                continue

            # ``or []`` also covers keys that are present but set to null.
            note_texts = []
            for note_entry in yaml_data.get("notes") or []:
                text_content = note_entry.get("text")
                if text_content:
                    note_texts.append(text_content)
            description = "\n".join(note_texts)

            references = []
            for fix in yaml_data.get("fixes") or []:
                for commit in fix.get("commits") or []:
                    commit_id = commit.get("id")
                    repo_url = commit.get("repository")
                    if not commit_id or not repo_url:
                        continue

                    # Strip only a trailing ".git"; str.replace would also
                    # corrupt hosts such as "foo.github.io".
                    repo_url = repo_url.rstrip("/").removesuffix(".git")
                    commit_url = repo_url + "/commit/" + commit_id
                    ref = ReferenceV2.from_url(commit_url)
                    references.append(ref)

            affected_packages = []
            for artifact in yaml_data.get("artifacts") or []:
                affected = artifact.get("affected")
                if not affected:
                    continue

                purl_str = artifact.get("id")
                if not purl_str:
                    continue

                try:
                    purl = PackageURL.from_string(purl_str)
                except ValueError as e:
                    # One malformed artifact must not abort the whole import.
                    self.log(f"Skipping invalid purl {purl_str!r} in {yaml_file}: {e}")
                    continue

                # NOTE(review): the version of an artifact flagged ``affected`` is
                # stored as a *fixed* version range here -- confirm this is the
                # intended mapping and not an inverted one.
                affected_package = AffectedPackageV2(
                    package=PackageURL(type=purl.type, namespace=purl.namespace, name=purl.name),
                    fixed_version_range=VersionRange.from_version(purl.version),
                )
                affected_packages.append(affected_package)

            advisory_url = get_advisory_url(
                file=yaml_file,
                base_path=base_path,
                url="https://github.com/SAP/project-kb/blob/vulnerability-data/statements/",
            )

            yield AdvisoryData(
                advisory_id=vulnerability_id,
                aliases=[],
                summary=description or "",
                affected_packages=affected_packages,
                references_v2=references,
                url=advisory_url,
                original_advisory_text=json.dumps(yaml_data, indent=2, ensure_ascii=False),
            )

    def clean_downloads(self):
        """Remove the cloned repository from disk."""
        self.log("Removing cloned repository...")
        # The attribute is missing when clone_repo() itself failed; on_failure()
        # must not raise AttributeError and hide the original error.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            vcs_response.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""
        self.clean_downloads()
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import csv
from pathlib import Path

from fetchcode.vcs import fetch_via_vcs

from vulnerabilities.models import AdvisoryV2
from vulnerabilities.models import CodeFixV2
from vulnerabilities.pipelines import VulnerableCodePipeline


class CollectFixCommitsProjectKBPipeline(VulnerableCodePipeline):
    """
    Pipeline to collect fix commits from Project KB:
    https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv

    Each CSV row whose first column matches the ``advisory_id`` of a stored
    AdvisoryV2 produces one CodeFixV2 per package affected by that advisory.
    """

    pipeline_id = "kb_project_fix_commits"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    qualified_name = "kb_project_fix_commits"
    repo_url = "git+https://github.com/SAP/project-kb"

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, collect, then clean up."""
        return (
            cls.clone,
            cls.collect_fix_commits,
            # Without this step the clone is left on disk after a successful run.
            cls.clean_downloads,
        )

    def clone(self):
        """Clone the ProjectKB repository and keep the VCS response on ``self``."""
        self.log("Cloning repositories for ProjectKB fix commits from CSV...")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def collect_fix_commits(self):
        """Read the MSR2019 CSV and bulk-create CodeFixV2 rows for known advisories."""
        self.log("Collecting fix commits from ProjectKB...")

        csv_path = Path(self.vcs_response.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"

        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            # Keep only four-column rows with an id, a repository and a commit;
            # a blank repository or commit would yield a meaningless commit URL.
            rows = [r for r in reader if len(r) == 4 and all(r[:3])]

        vuln_ids = {r[0] for r in rows}
        # Fetch all matching advisories and their packages up front rather than
        # querying once per CSV row.
        advisories = AdvisoryV2.objects.filter(advisory_id__in=vuln_ids).prefetch_related(
            "impacted_packages__affecting_packages"
        )
        advisory_map = {a.advisory_id: a for a in advisories}

        codefixes = []
        for vuln_id, repo_url, commit, _ in rows:
            advisory = advisory_map.get(vuln_id)
            if not advisory:
                continue

            repo_url = repo_url.rstrip("/").removesuffix(".git")
            vcs_url = f"{repo_url}/commit/{commit}"

            for impact in advisory.impacted_packages.all():
                for pkg in impact.affecting_packages.all():
                    codefixes.append(
                        CodeFixV2(
                            commits=[vcs_url],
                            advisory=advisory,
                            affected_package=pkg,
                        )
                    )

        if codefixes:
            CodeFixV2.objects.bulk_create(codefixes, ignore_conflicts=True)
            self.log(f"Created {len(codefixes)} CodeFix entries.")
        else:
            self.log("No CodeFix entries created.")

    def clean_downloads(self):
        """Remove the cloned repository from disk."""
        # The attribute is missing when clone() itself failed; on_failure()
        # must not raise AttributeError and hide the original error.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            self.log("Removing cloned repository")
            vcs_response.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""
        self.clean_downloads()
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from datetime import datetime
from datetime import timezone
from pathlib import Path
from types import SimpleNamespace
from unittest import TestCase
from unittest.mock import patch

import pytest

from vulnerabilities.models import AdvisoryV2
from vulnerabilities.models import CodeFixV2
from vulnerabilities.models import ImpactedPackage
from vulnerabilities.models import PackageV2
from vulnerabilities.pipelines.v2_importers.project_kb_importer import ProjectKBPipeline
from vulnerabilities.pipelines.v2_improvers.collect_commits_project_kb import (
CollectFixCommitsProjectKBPipeline,
)
from vulnerabilities.tests import util_tests

# Fixture directory used as the stand-in clone destination (``dest_dir``) by the
# tests below: it is read for the numbered ``statement.yaml`` files, the
# ``statement-N-expected.json`` files and the MSR2019 CSV.
TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "kbmsr2019"


class TestProjectKbImporterPipeline(TestCase):
    """
    Tests for the ProjectKB importer and fix-commit improver.

    Both pipelines run against the files under TEST_DATA; the cloned
    repository is replaced by a stub object exposing only ``dest_dir``.
    """

    @patch(
        "vulnerabilities.pipelines.v2_importers.project_kb_importer.get_advisory_url",
        return_value="https://mocked.url/advisory",
    )
    def test_project_kb_collect_advisories_v2(self, mock_get_advisory_url):
        """Each numbered statement.yaml converts to its expected JSON."""
        rglob_target = "vulnerabilities.pipelines.v2_importers.project_kb_importer.Path.rglob"

        importer = ProjectKBPipeline()
        importer.vcs_response = SimpleNamespace(dest_dir=TEST_DATA)

        for number in (1, 2, 3):
            statement_path = TEST_DATA / str(number) / "statement.yaml"
            expected_path = TEST_DATA / f"statement-{number}-expected.json"

            # Make the importer see exactly one statement per iteration.
            with patch(rglob_target, return_value=[statement_path]):
                collected = [entry.to_dict() for entry in importer.collect_advisories()]

            util_tests.check_results_against_json(collected, expected_path)

    @pytest.mark.django_db
    def test_collect_fix_commits_uses_existing_csv(self):
        """
        Test that CollectFixCommitsProjectKBPipeline.collect_fix_commits()
        reads an existing ProjectKB CSV file and creates CodeFixV2 entries.
        """
        tomcat_fix = (
            "https://github.com/apache/tomcat/commit/2835bb4e030c1c741ed0847bb3b9c3822e4fbc8a"
        )

        stored_advisory = AdvisoryV2.objects.create(
            advisory_id="CVE-2018-8034",
            datasource_id="test-datasource",
            avid="TEST-1234",
            unique_content_id="unique-test-id",
            url="https://example.com/advisory/CVE-2018-8034",
            date_collected=datetime.now(timezone.utc),
        )

        packages = [
            PackageV2.objects.create(name=package_name, type="test")
            for package_name in ("test_name1", "test_name2")
        ]

        impact = ImpactedPackage.objects.create(advisory=stored_advisory)
        impact.affecting_packages.set(packages)

        improver = CollectFixCommitsProjectKBPipeline()
        improver.vcs_response = SimpleNamespace(dest_dir=TEST_DATA)
        improver.collect_fix_commits()

        # One CodeFixV2 is expected per affected package, all with the same commit.
        recorded_commits = [fix.commits for fix in CodeFixV2.objects.all()]
        assert len(recorded_commits) == 2
        assert recorded_commits == [[tomcat_fix], [tomcat_fix]]
Loading