20
20
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21
21
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22
22
23
- import json
24
- from pathlib import Path
25
- from packageurl import PackageURL
26
23
from aboutcode .hashid import get_package_purls_yml_file_path , get_core_purl
27
-
28
24
from scanpipe .pipes .federatedcode import commit_changes
29
25
from scanpipe .pipes .federatedcode import push_changes
30
26
from minecode_pipelines import VERSION
27
+ from minecode_pipelines .miners .cran import extract_cran_packages
31
28
from minecode_pipelines .pipes import write_data_to_yaml_file
29
+ from minecode_pipelines .utils import grouper
32
30
33
31
PACKAGE_BATCH_SIZE = 1000
34
32
@@ -38,25 +36,32 @@ def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
38
36
Extract CRAN packages from the database, write their package URLs (purls) to YAML,
39
37
and commit changes in batches to the given cloned repository.
40
38
"""
41
- batch_counter = 0
42
- purl_files = []
43
- base_purls = []
39
+ packages_to_sync = list (extract_cran_packages (db_path ))
40
+
41
+ for package_batch in grouper (packages_to_sync , PACKAGE_BATCH_SIZE ):
42
+ purl_files = []
43
+ base_purls = []
44
44
45
- for updated_purls in extract_cran_packages (db_path ):
46
- batch_counter += 1
47
- if not updated_purls :
48
- continue
45
+ if logger :
46
+ logger (f"Starting package mining for a batch of { PACKAGE_BATCH_SIZE } packages" )
49
47
50
- first_purl = updated_purls [0 ]
51
- base_purl = get_core_purl (first_purl )
52
- purl_yaml_path = cloned_data_repo .working_dir / get_package_purls_yml_file_path (first_purl )
53
- write_data_to_yaml_file (path = purl_yaml_path , data = updated_purls )
48
+ for updated_purls in package_batch :
49
+ if not updated_purls :
50
+ continue # skip padded None values or empty
51
+
52
+ first_purl = updated_purls [0 ]
53
+ base_purl = get_core_purl (first_purl )
54
+ purl_yaml_path = cloned_data_repo .working_dir / get_package_purls_yml_file_path (
55
+ first_purl
56
+ )
57
+ write_data_to_yaml_file (path = purl_yaml_path , data = updated_purls )
54
58
55
- logger (f"writing packageURLs for package: { str (base_purl )} at: { purl_yaml_path } " )
56
- purl_files .append (purl_yaml_path )
57
- base_purls .append (str (base_purl ))
59
+ logger (f"writing packageURLs for package: { str (base_purl )} at: { purl_yaml_path } " )
60
+ purl_files .append (purl_yaml_path )
61
+ base_purls .append (str (base_purl ))
58
62
59
- if purl_files and base_purls and batch_counter > PACKAGE_BATCH_SIZE :
63
+ # After finishing the batch, commit & push if there’s something to save
64
+ if purl_files and base_purls :
60
65
commit_changes (
61
66
repo = cloned_data_repo ,
62
67
files_to_commit = purl_files ,
@@ -66,42 +71,3 @@ def mine_and_publish_cran_packageurls(cloned_data_repo, db_path, logger):
66
71
tool_version = VERSION ,
67
72
)
68
73
push_changes (repo = cloned_data_repo )
69
-
70
- batch_counter = 0
71
- purl_files .clear ()
72
- base_purls .clear ()
73
-
74
- if purl_files and base_purls :
75
- commit_changes (
76
- repo = cloned_data_repo ,
77
- files_to_commit = purl_files ,
78
- purls = base_purls ,
79
- mine_type = "packageURL" ,
80
- tool_name = "pkg:pypi/minecode-pipelines" ,
81
- tool_version = VERSION ,
82
- )
83
- push_changes (repo = cloned_data_repo )
84
-
85
-
86
- def extract_cran_packages (json_file_path : str ) -> list :
87
- """
88
- Extract package names and their versions from a CRAN DB JSON file.
89
- """
90
- db_path = Path (json_file_path )
91
- if not db_path .exists ():
92
- raise FileNotFoundError (f"File not found: { db_path } " )
93
-
94
- with open (db_path , encoding = "utf-8" ) as f :
95
- data = json .load (f )
96
-
97
- for pkg_name , pkg_data in data .items ():
98
- versions = list (pkg_data .get ("versions" , {}).keys ())
99
- purls = []
100
- for version in versions :
101
- purl = PackageURL (
102
- type = "cran" ,
103
- name = pkg_name ,
104
- version = version ,
105
- )
106
- purls .append (purl .to_string ())
107
- yield purls
0 commit comments