Skip to content

Commit efdb7ec

Browse files
committed
Update mine Cran to use grouper function
Move extract_cran_packages to miners/cran.py Signed-off-by: ziad hany <[email protected]>
1 parent 180d916 commit efdb7ec

File tree

3 files changed

+63
-58
lines changed

3 files changed

+63
-58
lines changed

minecode_pipelines/miners/cran.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9+
import json
from pathlib import Path
from typing import Iterator

import requests
from packageurl import PackageURL
1113

1214

1315
def fetch_cran_db(output_file="cran_db.json") -> Path:
@@ -26,3 +28,39 @@ def fetch_cran_db(output_file="cran_db.json") -> Path:
2628
f.write(chunk)
2729

2830
return output_path
31+
32+
33+
def extract_cran_packages(json_file_path: str) -> Iterator[list]:
    """
    Yield, for each package in a CRAN DB JSON file, the list of its versioned
    package URL strings.

    The file is expected to hold a single JSON object keyed by package name,
    where each entry lists its releases under a "versions" mapping, e.g.::

        {
            "AATtools": {
                "_id": "AATtools",
                "_rev": "8-9ebb721d05b946f2b437b49e892c9e8c",
                "name": "AATtools",
                "versions": {
                    "0.0.1": {...},
                    "0.0.2": {...},
                    "0.0.3": {...}
                }
            }
        }

    This is a generator: for every top-level entry, in file order, it yields
    one list of purl strings built with type "cran", the entry key as name and
    each key of "versions" as version. An entry without versions yields an
    empty list, so consumers still receive exactly one item per package and
    must skip empty lists themselves.

    Raise FileNotFoundError if ``json_file_path`` does not exist. Because this
    is a generator, that check only runs once iteration starts, not when the
    function is called.
    """
    db_path = Path(json_file_path)
    if not db_path.exists():
        raise FileNotFoundError(f"File not found: {db_path}")

    # The whole database is parsed up front; only purl construction is lazy.
    with open(db_path, encoding="utf-8") as f:
        data = json.load(f)

    for pkg_name, pkg_data in data.items():
        # Treat a missing or null "versions" entry as "no releases" instead of
        # failing on None.keys().
        versions = pkg_data.get("versions") or {}
        yield [
            PackageURL(type="cran", name=pkg_name, version=version).to_string()
            for version in versions
        ]

minecode_pipelines/pipelines/mine_cran.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def steps(cls):
4545
cls.check_federatedcode_eligibility,
4646
cls.setup_federatedcode_cran,
4747
cls.mine_and_publish_cran_packageurls,
48+
cls.cleanup_db_and_repo,
4849
)
4950

5051
def check_federatedcode_eligibility(self):

minecode_pipelines/pipes/cran.py

Lines changed: 24 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,13 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
import json
24-
from pathlib import Path
25-
from packageurl import PackageURL
2623
from aboutcode.hashid import get_package_purls_yml_file_path, get_core_purl
27-
2824
from scanpipe.pipes.federatedcode import commit_changes
2925
from scanpipe.pipes.federatedcode import push_changes
3026
from minecode_pipelines import VERSION
27+
from minecode_pipelines.miners.cran import extract_cran_packages
3128
from minecode_pipelines.pipes import write_data_to_yaml_file
29+
from minecode_pipelines.utils import grouper
3230

3331
PACKAGE_BATCH_SIZE = 1000
3432

@@ -38,25 +36,32 @@ def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
3836
Extract CRAN packages from the database, write their package URLs (purls) to YAML,
3937
and commit changes in batches to the given cloned repository.
4038
"""
41-
batch_counter = 0
42-
purl_files = []
43-
base_purls = []
39+
packages_to_sync = list(extract_cran_packages(db_path))
40+
41+
for package_batch in grouper(packages_to_sync, PACKAGE_BATCH_SIZE):
42+
purl_files = []
43+
base_purls = []
4444

45-
for updated_purls in extract_cran_packages(db_path):
46-
batch_counter += 1
47-
if not updated_purls:
48-
continue
45+
if logger:
46+
logger(f"Starting package mining for a batch of {PACKAGE_BATCH_SIZE} packages")
4947

50-
first_purl = updated_purls[0]
51-
base_purl = get_core_purl(first_purl)
52-
purl_yaml_path = cloned_data_repo.working_dir / get_package_purls_yml_file_path(first_purl)
53-
write_data_to_yaml_file(path=purl_yaml_path, data=updated_purls)
48+
for updated_purls in package_batch:
49+
if not updated_purls:
50+
continue # skip padded None values or empty
51+
52+
first_purl = updated_purls[0]
53+
base_purl = get_core_purl(first_purl)
54+
purl_yaml_path = cloned_data_repo.working_dir / get_package_purls_yml_file_path(
55+
first_purl
56+
)
57+
write_data_to_yaml_file(path=purl_yaml_path, data=updated_purls)
5458

55-
logger(f"writing packageURLs for package: {str(base_purl)} at: {purl_yaml_path}")
56-
purl_files.append(purl_yaml_path)
57-
base_purls.append(str(base_purl))
59+
logger(f"writing packageURLs for package: {str(base_purl)} at: {purl_yaml_path}")
60+
purl_files.append(purl_yaml_path)
61+
base_purls.append(str(base_purl))
5862

59-
if purl_files and base_purls and batch_counter > PACKAGE_BATCH_SIZE:
63+
# After finishing the batch, commit & push if there’s something to save
64+
if purl_files and base_purls:
6065
commit_changes(
6166
repo=cloned_data_repo,
6267
files_to_commit=purl_files,
@@ -66,42 +71,3 @@ def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
6671
tool_version=VERSION,
6772
)
6873
push_changes(repo=cloned_data_repo)
69-
70-
batch_counter = 0
71-
purl_files.clear()
72-
base_purls.clear()
73-
74-
if purl_files and base_purls:
75-
commit_changes(
76-
repo=cloned_data_repo,
77-
files_to_commit=purl_files,
78-
purls=base_purls,
79-
mine_type="packageURL",
80-
tool_name="pkg:pypi/minecode-pipelines",
81-
tool_version=VERSION,
82-
)
83-
push_changes(repo=cloned_data_repo)
84-
85-
86-
def extract_cran_packages(json_file_path: str) -> list:
87-
"""
88-
Extract package names and their versions from a CRAN DB JSON file.
89-
"""
90-
db_path = Path(json_file_path)
91-
if not db_path.exists():
92-
raise FileNotFoundError(f"File not found: {db_path}")
93-
94-
with open(db_path, encoding="utf-8") as f:
95-
data = json.load(f)
96-
97-
for pkg_name, pkg_data in data.items():
98-
versions = list(pkg_data.get("versions", {}).keys())
99-
purls = []
100-
for version in versions:
101-
purl = PackageURL(
102-
type="cran",
103-
name=pkg_name,
104-
version=version,
105-
)
106-
purls.append(purl.to_string())
107-
yield purls

0 commit comments

Comments
 (0)