scanoss · eeisegn · Aug 22, 2023 · Aug 17, 2023 · Aug 18, 2023 · Aug 22, 2023
diff --git a/setup.cfg b/setup.cfg
@@ -36,7 +36,7 @@ install_requires =
 
 [options.extras_require]
 fast_winnowing =
-    scanoss_winnowing>=0.2.0
+    scanoss_winnowing>=0.3.0
 
 [options.packages.find]
 where = src

diff --git a/src/scanoss/cli.py b/src/scanoss/cli.py
@@ -77,8 +77,8 @@ def setup_args() -> None:
     p_scan.add_argument('--output',   '-o', type=str, help='Output result file name (optional - default stdout).')
     p_scan.add_argument('--format',   '-f', type=str, choices=['plain', 'cyclonedx', 'spdxlite', 'csv'],
                         help='Result output format (optional - default: plain)')
-    p_scan.add_argument('--threads', '-T', type=int, default=10,
-                        help='Number of threads to use while scanning (optional - default 10)')
+    p_scan.add_argument('--threads', '-T', type=int, default=5,
+                        help='Number of threads to use while scanning (optional - default 5)')
     p_scan.add_argument('--flags', '-F', type=int,
                         help='Scanning engine flags (1: disable snippet matching, 2 enable snippet ids, '
                              '4: disable dependencies, 8: disable licenses, 16: disable copyrights,'
@@ -87,10 +87,10 @@ def setup_args() -> None:
                              '1024: enable download_url, 2048: enable GitHub full path, '
                              '4096: disable extended server stats)')
     p_scan.add_argument('--skip-snippets', '-S', action='store_true', help='Skip the generation of snippets')
-    p_scan.add_argument('--post-size', '-P', type=int, default=64,
-                        help='Number of kilobytes to limit the post to while scanning (optional - default 64)')
-    p_scan.add_argument('--timeout', '-M', type=int, default=120,
-                        help='Timeout (in seconds) for API communication (optional - default 120)')
+    p_scan.add_argument('--post-size', '-P', type=int, default=32,
+                        help='Number of kilobytes to limit the post to while scanning (optional - default 32)')
+    p_scan.add_argument('--timeout', '-M', type=int, default=180,
+                        help='Timeout (in seconds) for API communication (optional - default 180)')
     p_scan.add_argument('--retry', '-R', type=int, default=5,
                         help='Retry limit for API communication (optional - default 5)')
     p_scan.add_argument('--no-wfp-output', action='store_true', help='Skip WFP file generation')
@@ -445,9 +445,9 @@ def scan(parser, args):
             print_stderr("Scanning all hidden files/folders...")
         if args.skip_snippets:
             print_stderr("Skipping snippets...")
-        if args.post_size != 64:
+        if args.post_size != 32:
             print_stderr(f'Changing scanning POST size to: {args.post_size}k...')
-        if args.timeout != 120:
+        if args.timeout != 180:
             print_stderr(f'Changing scanning POST timeout to: {args.timeout}...')
         if args.retry != 5:
             print_stderr(f'Changing scanning POST retry to: {args.retry}...')

diff --git a/src/scanoss/scanner.py b/src/scanoss/scanner.py
@@ -97,7 +97,7 @@ class Scanner(ScanossBase):
     def __init__(self, wfp: str = None, scan_output: str = None, output_format: str = 'plain',
                  debug: bool = False, trace: bool = False, quiet: bool = False, api_key: str = None, url: str = None,
                  sbom_path: str = None, scan_type: str = None, flags: str = None, nb_threads: int = 5,
-                 post_size: int = 64, timeout: int = 120, no_wfp_file: bool = False,
+                 post_size: int = 32, timeout: int = 180, no_wfp_file: bool = False,
                  all_extensions: bool = False, all_folders: bool = False, hidden_files_folders: bool = False,
                  scan_options: int = 7, sc_timeout: int = 600, sc_command: str = None, grpc_url: str = None,
                  obfuscate: bool = False, ignore_cert_errors: bool = False, proxy: str = None, grpc_proxy: str = None,
@@ -141,6 +141,7 @@ def __init__(self, wfp: str = None, scan_output: str = None, output_format: str
         else:
             self.threaded_scan = None
         self.max_post_size = post_size * 1024 if post_size > 0 else MAX_POST_SIZE  # Set the max post size (default 64k)
+        self.post_file_count = post_size if post_size > 0 else 32  # Max number of files for any given POST (default 32)
         if self._skip_snippets:
             self.max_post_size = 8 * 1024  # 8k Max post size if we're skipping snippets
 
@@ -360,11 +361,13 @@ def scan_folder(self, scan_dir: str) -> bool:
         spinner = None
         if not self.quiet and self.isatty:
             spinner = Spinner('Fingerprinting ')
+        save_wfps_for_print = not self.no_wfp_file or not self.threaded_scan
         wfp_list = []
         scan_block = ''
         scan_size = 0
         queue_size = 0
-        file_count = 0
+        file_count = 0  # count all files fingerprinted
+        wfp_file_count = 0  # count number of files in each queue post
         scan_started = False
         for root, dirs, files in os.walk(scan_dir):
             self.print_trace(f'U Root: {root}, Dirs: {dirs}, Files {files}')
@@ -389,7 +392,9 @@ def scan_folder(self, scan_dir: str) -> bool:
                     wfp = self.winnowing.wfp_for_file(path, Scanner.__strip_dir(scan_dir, scan_dir_len, path))
                     if wfp is None or wfp == '':
                         self.print_stderr(f'Warning: No WFP returned for {path}')
-                    wfp_list.append(wfp)
+                        continue
+                    if save_wfps_for_print:
+                        wfp_list.append(wfp)
                     file_count += 1
                     if self.threaded_scan:
                         wfp_size = len(wfp.encode("utf-8"))
@@ -398,13 +403,17 @@ def scan_folder(self, scan_dir: str) -> bool:
                             self.threaded_scan.queue_add(scan_block)
                             queue_size += 1
                             scan_block = ''
+                            wfp_file_count = 0
                         scan_block += wfp
                         scan_size = len(scan_block.encode("utf-8"))
-                        if scan_size >= self.max_post_size:
+                        wfp_file_count += 1
+                        # If the scan request block (group of WFPs) is larger than the POST size or we have reached the file limit, add it to the queue
+                        if wfp_file_count > self.post_file_count or scan_size >= self.max_post_size:
                             self.threaded_scan.queue_add(scan_block)
                             queue_size += 1
                             scan_block = ''
-                        if queue_size > self.nb_threads and not scan_started:  # Start scanning if we have something to do
+                            wfp_file_count = 0
+                        if not scan_started and queue_size > self.nb_threads:  # Start scanning if we have something to do
                             scan_started = True
                             if not self.threaded_scan.run(wait=False):
                                 self.print_stderr(
@@ -416,8 +425,8 @@ def scan_folder(self, scan_dir: str) -> bool:
         if spinner:
             spinner.finish()
 
-        if wfp_list:
-            if not self.no_wfp_file or not self.threaded_scan:  # Write a WFP file if no threading is requested
+        if file_count > 0:
+            if save_wfps_for_print:  # Write a WFP file unless WFP output is disabled and a threaded scan is in use
                 self.print_debug(f'Writing fingerprints to {self.wfp}')
                 with open(self.wfp, 'w') as f:
                     f.write(''.join(wfp_list))
@@ -730,7 +739,8 @@ def scan_wfp_file_threaded(self, file: str = None, file_map: dict = None) -> boo
             raise Exception(f"ERROR: Specified WFP file does not exist or is not a file: {wfp_file}")
         cur_size = 0
         queue_size = 0
-        file_count = 0
+        file_count = 0  # count all files fingerprinted
+        wfp_file_count = 0  # count number of files in each queue post
         scan_started = False
         wfp = ''
         scan_block = ''
@@ -742,17 +752,19 @@ def scan_wfp_file_threaded(self, file: str = None, file_map: dict = None) -> boo
                         cur_size = len(wfp.encode("utf-8"))
                     scan_block = line  # Start storing the next file
                     file_count += 1
+                    wfp_file_count += 1
                 else:
                     scan_block += line  # Store the rest of the WFP for this file
                 l_size = cur_size + len(scan_block.encode('utf-8'))
                 # Hit the max post size, so sending the current batch and continue processing
-                if l_size >= self.max_post_size and wfp:
+                if (wfp_file_count > self.post_file_count or l_size >= self.max_post_size) and wfp:
                     if self.debug and cur_size > self.max_post_size:
                         Scanner.print_stderr(f'Warning: Post size {cur_size} greater than limit {self.max_post_size}')
                     self.threaded_scan.queue_add(wfp)
                     queue_size += 1
                     wfp = ''
-                    if queue_size > self.nb_threads and not scan_started:  # Start scanning if we have something to do
+                    wfp_file_count = 0
+                    if not scan_started and queue_size > self.nb_threads:  # Start scanning if we have something to do
                         scan_started = True
                         if not self.threaded_scan.run(wait=False):
                             self.print_stderr(

diff --git a/src/scanoss/scanossapi.py b/src/scanoss/scanossapi.py
@@ -52,7 +52,7 @@ class ScanossApi(ScanossBase):
 
     def __init__(self, scan_type: str = None, sbom_path: str = None, scan_format: str = None, flags: str = None,
                  url: str = None, api_key: str = None, debug: bool = False, trace: bool = False, quiet: bool = False,
-                 timeout: int = 120, ver_details: str = None, ignore_cert_errors: bool = False,
+                 timeout: int = 180, ver_details: str = None, ignore_cert_errors: bool = False,
                  proxy: str = None, ca_cert: str = None, pac: PACFile = None, retry: int = 5):
         """
         Initialise the SCANOSS API
@@ -81,7 +81,7 @@ def __init__(self, scan_type: str = None, sbom_path: str = None, scan_format: st
         self.scan_format = scan_format if scan_format else 'plain'
         self.sbom_path = sbom_path
         self.flags = flags
-        self.timeout = timeout if timeout > 5 else 120
+        self.timeout = timeout if timeout > 5 else 180
         self.retry_limit = retry if retry >= 0 else 5
         self.ignore_cert_errors = ignore_cert_errors
         self.headers = {}

diff --git a/src/scanoss/winnowing.py b/src/scanoss/winnowing.py
@@ -57,13 +57,14 @@
     ".o", ".a", ".so", ".obj", ".dll", ".lib", ".out", ".app", ".bin",
     ".lst", ".dat", ".json", ".htm", ".html", ".xml", ".md", ".txt",
     ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".pages", ".key", ".numbers",
-    ".pdf", ".min.js", ".mf", ".sum"
+    ".pdf", ".min.js", ".mf", ".sum", ".woff", ".woff2"
 }
 
 CRC8_MAXIM_DOW_TABLE_SIZE = 0x100
-CRC8_MAXIM_DOW_POLYNOMIAL = 0x8C # 0x31 reflected
-CRC8_MAXIM_DOW_INITIAL = 0x00 # 0x00 reflected
-CRC8_MAXIM_DOW_FINAL = 0x00 # 0x00 reflected
+CRC8_MAXIM_DOW_POLYNOMIAL = 0x8C  # 0x31 reflected
+CRC8_MAXIM_DOW_INITIAL = 0x00  # 0x00 reflected
+CRC8_MAXIM_DOW_FINAL = 0x00  # 0x00 reflected
+
 
 class Winnowing(ScanossBase):
     """
@@ -108,16 +109,16 @@ class Winnowing(ScanossBase):
     a list of WFP fingerprints with their corresponding line numbers.
     """
 
-    def __init__(self, size_limit: bool = True, debug: bool = False, trace: bool = False, quiet: bool = False,
-                 skip_snippets: bool = False, post_size: int = 64, all_extensions: bool = False, 
+    def __init__(self, size_limit: bool = False, debug: bool = False, trace: bool = False, quiet: bool = False,
+                 skip_snippets: bool = False, post_size: int = 32, all_extensions: bool = False,
                  obfuscate: bool = False, hpsm: bool = False
                  ):
         """
         Instantiate Winnowing class
         Parameters
         ----------
             size_limit: bool
-                Limit the size of a fingerprint to 64k (post size) - Default True
+                Limit the size of a fingerprint to 32k (post size) - Default False
         """
         super().__init__(debug, trace, quiet)
         self.size_limit = size_limit
@@ -130,6 +131,7 @@ def __init__(self, size_limit: bool = True, debug: bool = False, trace: bool = F
         self.hpsm = hpsm
         if hpsm:
             self.crc8_maxim_dow_table = []
+            self.crc8_generate_table()
 
     @staticmethod
     def __normalize(byte):
@@ -285,7 +287,7 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
                                     if self.size_limit and \
                                             (len(wfp.encode("utf-8")) + len(
                                                 output.encode("utf-8"))) > self.max_post_size:
-                                        self.print_debug(f'Truncating WFP (64k limit) for: {file}')
+                                        self.print_debug(f'Truncating WFP ({self.max_post_size} limit) for: {file}')
                                         output = ''
                                         break  # Stop collecting snippets as it's over 64k
                                     wfp += output + '\n'
@@ -310,53 +312,82 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
         return wfp
 
     def calc_hpsm(self, content):
-        list_normalized = []    #Array of numbers
-        crc_lines = []  #Array of numbers that represent the crc8_maxim for each line of the file
+        """
+        Calculate the HPSM data for this content
+
+        :param content: content bytes to calculate
+        :return: HPSM encoded data
+        """
+        list_normalized = []  # Array of numbers
+        crc_lines = []  # Array of numbers that represent the crc8_maxim for each line of the file
         last_line = 0
-        self.crc8_generate_table()
         for i, byte in enumerate(content):
             c = byte
-            if c == ASCII_LF:   #When there is a new line
+            if c == ASCII_LF:   # When there is a new line
                 if len(list_normalized): 
                     crc_lines.append(self.crc8_buffer(list_normalized))
-                    list_normalized=[]
+                    list_normalized = []
                 elif last_line+1 == i:
                     crc_lines.append(0xFF)
-                elif i-last_line  > 1:
+                elif i-last_line > 1:
                     crc_lines.append(0x00)
                 last_line = i
             else:
                 c_normalized = self.__normalize(c)
                 if c_normalized != 0:
                     list_normalized.append(c_normalized)
-        crc_lines_hex = []
-        for x in crc_lines:
-            crc_lines_hex.append(hex(x))
         hpsm = ''.join('{:02x}'.format(x) for x in crc_lines)
         return hpsm
 
     def crc8_generate_table(self):
-        for i in range(CRC8_MAXIM_DOW_TABLE_SIZE):
-            self.crc8_maxim_dow_table.append(self.crc8_byte_checksum(0, i))
-
-    def crc8_byte_checksum(self, crc, byte):
+        """
+        Generate the CRC8 maxim dow table
+
+        :return: nothing
+        """
+        if not self.crc8_maxim_dow_table or len(self.crc8_maxim_dow_table) == 0:
+            for i in range(CRC8_MAXIM_DOW_TABLE_SIZE):
+                self.crc8_maxim_dow_table.append(self.crc8_byte_checksum(0, i))
+
+    @staticmethod
+    def crc8_byte_checksum(crc: int, byte):
+        """
+        Calculate the CRC8 checksum for the given byte
+
+        :param crc:
+        :param byte:
+        :return: CRC for the byte
+        """
         crc ^= byte
         for count in range(8):
-            isSet = crc & 0x01
+            is_set = crc & 0x01
             crc >>= 1
-            if isSet:
+            if is_set:
                 crc ^= CRC8_MAXIM_DOW_POLYNOMIAL
         return crc
 
-    def crc8_byte(self, crc, byte):
+    def crc8_byte(self, crc: int, byte):
+        """
+        Calculate the CRC8 for the given CRC & Byte
+
+        :param crc:
+        :param byte:
+        :return:
+        """
         index = byte ^ crc
-        return self.crc8_maxim_dow_table[ index ] ^ ( crc >> 8 )
+        return self.crc8_maxim_dow_table[index] ^ (crc >> 8)
 
     def crc8_buffer(self, buffer):
+        """
+        Calculate the CRC for the given buffer list
+
+        :param buffer:
+        :return:
+        """
         crc = CRC8_MAXIM_DOW_INITIAL
         for index in range(len(buffer)):
             crc = self.crc8_byte(crc, buffer[index])
-        crc ^= CRC8_MAXIM_DOW_FINAL
+        crc ^= CRC8_MAXIM_DOW_FINAL  # Apply the final XOR value (0x00 for Maxim/Dow)
         return crc
 
 #