Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ install_requires =

[options.extras_require]
fast_winnowing =
scanoss_winnowing>=0.2.0
scanoss_winnowing>=0.3.0

[options.packages.find]
where = src
Expand Down
16 changes: 8 additions & 8 deletions src/scanoss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ def setup_args() -> None:
p_scan.add_argument('--output', '-o', type=str, help='Output result file name (optional - default stdout).')
p_scan.add_argument('--format', '-f', type=str, choices=['plain', 'cyclonedx', 'spdxlite', 'csv'],
help='Result output format (optional - default: plain)')
p_scan.add_argument('--threads', '-T', type=int, default=10,
help='Number of threads to use while scanning (optional - default 10)')
p_scan.add_argument('--threads', '-T', type=int, default=5,
help='Number of threads to use while scanning (optional - default 5)')
p_scan.add_argument('--flags', '-F', type=int,
help='Scanning engine flags (1: disable snippet matching, 2 enable snippet ids, '
'4: disable dependencies, 8: disable licenses, 16: disable copyrights,'
Expand All @@ -87,10 +87,10 @@ def setup_args() -> None:
'1024: enable download_url, 2048: enable GitHub full path, '
'4096: disable extended server stats)')
p_scan.add_argument('--skip-snippets', '-S', action='store_true', help='Skip the generation of snippets')
p_scan.add_argument('--post-size', '-P', type=int, default=64,
help='Number of kilobytes to limit the post to while scanning (optional - default 64)')
p_scan.add_argument('--timeout', '-M', type=int, default=120,
help='Timeout (in seconds) for API communication (optional - default 120)')
p_scan.add_argument('--post-size', '-P', type=int, default=32,
help='Number of kilobytes to limit the post to while scanning (optional - default 32)')
p_scan.add_argument('--timeout', '-M', type=int, default=180,
help='Timeout (in seconds) for API communication (optional - default 180)')
p_scan.add_argument('--retry', '-R', type=int, default=5,
help='Retry limit for API communication (optional - default 5)')
p_scan.add_argument('--no-wfp-output', action='store_true', help='Skip WFP file generation')
Expand Down Expand Up @@ -445,9 +445,9 @@ def scan(parser, args):
print_stderr("Scanning all hidden files/folders...")
if args.skip_snippets:
print_stderr("Skipping snippets...")
if args.post_size != 64:
if args.post_size != 32:
print_stderr(f'Changing scanning POST size to: {args.post_size}k...')
if args.timeout != 120:
if args.timeout != 180:
print_stderr(f'Changing scanning POST timeout to: {args.timeout}...')
if args.retry != 5:
print_stderr(f'Changing scanning POST retry to: {args.retry}...')
Expand Down
32 changes: 22 additions & 10 deletions src/scanoss/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ class Scanner(ScanossBase):
def __init__(self, wfp: str = None, scan_output: str = None, output_format: str = 'plain',
debug: bool = False, trace: bool = False, quiet: bool = False, api_key: str = None, url: str = None,
sbom_path: str = None, scan_type: str = None, flags: str = None, nb_threads: int = 5,
post_size: int = 64, timeout: int = 120, no_wfp_file: bool = False,
post_size: int = 32, timeout: int = 180, no_wfp_file: bool = False,
all_extensions: bool = False, all_folders: bool = False, hidden_files_folders: bool = False,
scan_options: int = 7, sc_timeout: int = 600, sc_command: str = None, grpc_url: str = None,
obfuscate: bool = False, ignore_cert_errors: bool = False, proxy: str = None, grpc_proxy: str = None,
Expand Down Expand Up @@ -141,6 +141,7 @@ def __init__(self, wfp: str = None, scan_output: str = None, output_format: str
else:
self.threaded_scan = None
self.max_post_size = post_size * 1024 if post_size > 0 else MAX_POST_SIZE # Set the max post size (default 64k)
self.post_file_count = post_size if post_size > 0 else 32 # Max number of files for any given POST (default 32)
if self._skip_snippets:
self.max_post_size = 8 * 1024 # 8k Max post size if we're skipping snippets

Expand Down Expand Up @@ -360,11 +361,13 @@ def scan_folder(self, scan_dir: str) -> bool:
spinner = None
if not self.quiet and self.isatty:
spinner = Spinner('Fingerprinting ')
save_wfps_for_print = not self.no_wfp_file or not self.threaded_scan
wfp_list = []
scan_block = ''
scan_size = 0
queue_size = 0
file_count = 0
file_count = 0 # count all files fingerprinted
wfp_file_count = 0 # count number of files in each queue post
scan_started = False
for root, dirs, files in os.walk(scan_dir):
self.print_trace(f'U Root: {root}, Dirs: {dirs}, Files {files}')
Expand All @@ -389,7 +392,9 @@ def scan_folder(self, scan_dir: str) -> bool:
wfp = self.winnowing.wfp_for_file(path, Scanner.__strip_dir(scan_dir, scan_dir_len, path))
if wfp is None or wfp == '':
self.print_stderr(f'Warning: No WFP returned for {path}')
wfp_list.append(wfp)
continue
if save_wfps_for_print:
wfp_list.append(wfp)
file_count += 1
if self.threaded_scan:
wfp_size = len(wfp.encode("utf-8"))
Expand All @@ -398,13 +403,17 @@ def scan_folder(self, scan_dir: str) -> bool:
self.threaded_scan.queue_add(scan_block)
queue_size += 1
scan_block = ''
wfp_file_count = 0
scan_block += wfp
scan_size = len(scan_block.encode("utf-8"))
if scan_size >= self.max_post_size:
wfp_file_count += 1
# If the scan request block (group of WFPs) is larger than the POST size, or we have reached the file limit, add it to the queue
if wfp_file_count > self.post_file_count or scan_size >= self.max_post_size:
self.threaded_scan.queue_add(scan_block)
queue_size += 1
scan_block = ''
if queue_size > self.nb_threads and not scan_started: # Start scanning if we have something to do
wfp_file_count = 0
if not scan_started and queue_size > self.nb_threads: # Start scanning if we have something to do
scan_started = True
if not self.threaded_scan.run(wait=False):
self.print_stderr(
Expand All @@ -416,8 +425,8 @@ def scan_folder(self, scan_dir: str) -> bool:
if spinner:
spinner.finish()

if wfp_list:
if not self.no_wfp_file or not self.threaded_scan: # Write a WFP file if no threading is requested
if file_count > 0:
if save_wfps_for_print: # Write a WFP file if no threading is requested
self.print_debug(f'Writing fingerprints to {self.wfp}')
with open(self.wfp, 'w') as f:
f.write(''.join(wfp_list))
Expand Down Expand Up @@ -730,7 +739,8 @@ def scan_wfp_file_threaded(self, file: str = None, file_map: dict = None) -> boo
raise Exception(f"ERROR: Specified WFP file does not exist or is not a file: {wfp_file}")
cur_size = 0
queue_size = 0
file_count = 0
file_count = 0 # count all files fingerprinted
wfp_file_count = 0 # count number of files in each queue post
scan_started = False
wfp = ''
scan_block = ''
Expand All @@ -742,17 +752,19 @@ def scan_wfp_file_threaded(self, file: str = None, file_map: dict = None) -> boo
cur_size = len(wfp.encode("utf-8"))
scan_block = line # Start storing the next file
file_count += 1
wfp_file_count += 1
else:
scan_block += line # Store the rest of the WFP for this file
l_size = cur_size + len(scan_block.encode('utf-8'))
# Hit the max post size or the per-post file limit, so send the current batch and continue processing
if l_size >= self.max_post_size and wfp:
if (wfp_file_count > self.post_file_count or l_size >= self.max_post_size) and wfp:
if self.debug and cur_size > self.max_post_size:
Scanner.print_stderr(f'Warning: Post size {cur_size} greater than limit {self.max_post_size}')
self.threaded_scan.queue_add(wfp)
queue_size += 1
wfp = ''
if queue_size > self.nb_threads and not scan_started: # Start scanning if we have something to do
wfp_file_count = 0
if not scan_started and queue_size > self.nb_threads: # Start scanning if we have something to do
scan_started = True
if not self.threaded_scan.run(wait=False):
self.print_stderr(
Expand Down
4 changes: 2 additions & 2 deletions src/scanoss/scanossapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class ScanossApi(ScanossBase):

def __init__(self, scan_type: str = None, sbom_path: str = None, scan_format: str = None, flags: str = None,
url: str = None, api_key: str = None, debug: bool = False, trace: bool = False, quiet: bool = False,
timeout: int = 120, ver_details: str = None, ignore_cert_errors: bool = False,
timeout: int = 180, ver_details: str = None, ignore_cert_errors: bool = False,
proxy: str = None, ca_cert: str = None, pac: PACFile = None, retry: int = 5):
"""
Initialise the SCANOSS API
Expand Down Expand Up @@ -81,7 +81,7 @@ def __init__(self, scan_type: str = None, sbom_path: str = None, scan_format: st
self.scan_format = scan_format if scan_format else 'plain'
self.sbom_path = sbom_path
self.flags = flags
self.timeout = timeout if timeout > 5 else 120
self.timeout = timeout if timeout > 5 else 180
self.retry_limit = retry if retry >= 0 else 5
self.ignore_cert_errors = ignore_cert_errors
self.headers = {}
Expand Down
83 changes: 57 additions & 26 deletions src/scanoss/winnowing.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,14 @@
".o", ".a", ".so", ".obj", ".dll", ".lib", ".out", ".app", ".bin",
".lst", ".dat", ".json", ".htm", ".html", ".xml", ".md", ".txt",
".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".pages", ".key", ".numbers",
".pdf", ".min.js", ".mf", ".sum"
".pdf", ".min.js", ".mf", ".sum", ".woff", ".woff2"
}

CRC8_MAXIM_DOW_TABLE_SIZE = 0x100
CRC8_MAXIM_DOW_POLYNOMIAL = 0x8C # 0x31 reflected
CRC8_MAXIM_DOW_INITIAL = 0x00 # 0x00 reflected
CRC8_MAXIM_DOW_FINAL = 0x00 # 0x00 reflected
CRC8_MAXIM_DOW_POLYNOMIAL = 0x8C # 0x31 reflected
CRC8_MAXIM_DOW_INITIAL = 0x00 # 0x00 reflected
CRC8_MAXIM_DOW_FINAL = 0x00 # 0x00 reflected


class Winnowing(ScanossBase):
"""
Expand Down Expand Up @@ -108,16 +109,16 @@ class Winnowing(ScanossBase):
a list of WFP fingerprints with their corresponding line numbers.
"""

def __init__(self, size_limit: bool = True, debug: bool = False, trace: bool = False, quiet: bool = False,
skip_snippets: bool = False, post_size: int = 64, all_extensions: bool = False,
def __init__(self, size_limit: bool = False, debug: bool = False, trace: bool = False, quiet: bool = False,
skip_snippets: bool = False, post_size: int = 32, all_extensions: bool = False,
obfuscate: bool = False, hpsm: bool = False
):
"""
Instantiate Winnowing class
Parameters
----------
size_limit: bool
Limit the size of a fingerprint to 64k (post size) - Default True
Limit the size of a fingerprint to 32k (post size) - Default False
"""
super().__init__(debug, trace, quiet)
self.size_limit = size_limit
Expand All @@ -130,6 +131,7 @@ def __init__(self, size_limit: bool = True, debug: bool = False, trace: bool = F
self.hpsm = hpsm
if hpsm:
self.crc8_maxim_dow_table = []
self.crc8_generate_table()

@staticmethod
def __normalize(byte):
Expand Down Expand Up @@ -285,7 +287,7 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
if self.size_limit and \
(len(wfp.encode("utf-8")) + len(
output.encode("utf-8"))) > self.max_post_size:
self.print_debug(f'Truncating WFP (64k limit) for: {file}')
self.print_debug(f'Truncating WFP ({self.max_post_size} limit) for: {file}')
output = ''
break # Stop collecting snippets as the WFP would exceed the max post size
wfp += output + '\n'
Expand All @@ -310,53 +312,82 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
return wfp

def calc_hpsm(self, content):
list_normalized = [] #Array of numbers
crc_lines = [] #Array of numbers that represent the crc8_maxim for each line of the file
"""
Calculate the HPSM data for this content

:param content: content bytes to calculate
:return: HPSM encoded data
"""
list_normalized = [] # Array of numbers
crc_lines = [] # Array of numbers that represent the crc8_maxim for each line of the file
last_line = 0
self.crc8_generate_table()
for i, byte in enumerate(content):
c = byte
if c == ASCII_LF: #When there is a new line
if c == ASCII_LF: # When there is a new line
if len(list_normalized):
crc_lines.append(self.crc8_buffer(list_normalized))
list_normalized=[]
list_normalized = []
elif last_line+1 == i:
crc_lines.append(0xFF)
elif i-last_line > 1:
elif i-last_line > 1:
crc_lines.append(0x00)
last_line = i
else:
c_normalized = self.__normalize(c)
if c_normalized != 0:
list_normalized.append(c_normalized)
crc_lines_hex = []
for x in crc_lines:
crc_lines_hex.append(hex(x))
hpsm = ''.join('{:02x}'.format(x) for x in crc_lines)
return hpsm

def crc8_generate_table(self):
for i in range(CRC8_MAXIM_DOW_TABLE_SIZE):
self.crc8_maxim_dow_table.append(self.crc8_byte_checksum(0, i))

def crc8_byte_checksum(self, crc, byte):
"""
Generate the CRC8 maxim dow table

:return: nothing
"""
if not self.crc8_maxim_dow_table or len(self.crc8_maxim_dow_table) == 0:
for i in range(CRC8_MAXIM_DOW_TABLE_SIZE):
self.crc8_maxim_dow_table.append(self.crc8_byte_checksum(0, i))

@staticmethod
def crc8_byte_checksum(crc: int, byte):
"""
Calculate the CRC8 checksum for the given byte

:param crc:
:param byte:
:return: CRC for the byte
"""
crc ^= byte
for count in range(8):
isSet = crc & 0x01
is_set = crc & 0x01
crc >>= 1
if isSet:
if is_set:
crc ^= CRC8_MAXIM_DOW_POLYNOMIAL
return crc

def crc8_byte(self, crc, byte):
def crc8_byte(self, crc: int, byte):
"""
Calculate the CRC8 for the given CRC & Byte

:param crc:
:param byte:
:return:
"""
index = byte ^ crc
return self.crc8_maxim_dow_table[ index ] ^ ( crc >> 8 )
return self.crc8_maxim_dow_table[index] ^ (crc >> 8)

def crc8_buffer(self, buffer):
"""
Calculate the CRC for the given buffer list

:param buffer:
:return:
"""
crc = CRC8_MAXIM_DOW_INITIAL
for index in range(len(buffer)):
crc = self.crc8_byte(crc, buffer[index])
crc ^= CRC8_MAXIM_DOW_FINAL
crc ^= CRC8_MAXIM_DOW_FINAL # Bitwise XOR of the crc with the Maxim DOW final value
return crc

#
Expand Down