Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions inc/limits.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

/* Limits */
#define MAX_ARGLN 512 // Max command line argument length
#define MAX_COMPONENT_SCAN_HASHES 1000 // Max number of url hashes accepted by -C
#define MAX_PATH 1024
#define MAX_HASHES_READ 65535
#define MAX_FILE_SIZE (1024 * 1024 * 4)
Expand All @@ -51,7 +52,9 @@
#define SNIPPETS_DEFAULT_ADJUST_TOLERANCE true /** Adjust tolerance based on file size */
#define SNIPPETS_DEFAULT_HONOR_FILE_EXTENSION true /** Honor file extension during snippet matching */
#define DEFAULT_FETCH_MAX_FILES 12000 /** Maximum number of files to fetch during component matching */
#define MAX_FILE_CONTENT_SIZE_DEFAULT (1024ULL * 1024 * 50) /** Default maximum file content size (50MB) printed by -k */
/* Variables */
extern int fetch_max_files; // Maximum number of files to fetch during component matching
extern uint64_t max_file_content_size; // Maximum file content size printed by -k

#endif
1 change: 1 addition & 0 deletions inc/match_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,5 +147,6 @@ void component_list_destroy(component_list_t *list);
bool component_list_add_binary(component_list_t *list, component_data_t *new_comp, bool (*val)(component_data_t *a, component_data_t *b), bool remove_a);
bool match_list_eval(match_list_t *list, match_data_t * in, bool (*eval)(match_data_t *fpa, match_data_t *fpb));
void match_list_tolerance_set(float in);
float match_list_tolerance_get(void);

#endif
80 changes: 80 additions & 0 deletions inc/purl_scan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* inc/purl_scan.h
*
* SCANOSS Inventory Scanner
*
* Copyright (C) 2018-2024 SCANOSS.COM
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.

* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.

* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#ifndef __PURL_SCAN_H
#define __PURL_SCAN_H

/**
* @brief Resolve the purls and url hashes related to a file MD5.
*
* Looks up the given file MD5 in the KB (url and file tables) and prints, in
* JSON, the unique purls associated with that file along with every url hash
* (url_id) where the file was seen and the best (lowest) KB rank found for
* each purl.
*
* Entry point for the -P / --purl command line option: main() passes the
* option argument here and hands the return value straight to exit(), so the
* value returned is the process exit status.
*
* @param file_md5_hex file MD5 in hex (32 chars)
* @return EXIT_SUCCESS on success, EXIT_FAILURE on invalid input
*/
int purl_scan(char *file_md5_hex);

/**
* @brief Report the details of one or more components identified by url hash.
*
* Accepts a single url_hash (url_id) or a comma-separated list. Each hash is
* looked up in the KB and its component details are rendered in JSON, reusing
* the same rendering used in regular scan reports. Output is always an array
* under the "results" key, one entry per valid hash:
* {"results": [{"url_hash": "...", "component": {...}}, ...]}.
* Invalid hashes are skipped with a stderr warning.
*
* Entry point for the -C / --url-hash command line option: main() passes the
* option argument here and hands the return value straight to exit().
*
* NOTE(review): inc/limits.h defines MAX_COMPONENT_SCAN_HASHES (1000) as the
* maximum number of url hashes accepted by -C; what happens when the list is
* longer is not described here - confirm against the implementation.
*
* @param url_hash_list comma-separated url hashes in hex (32 chars each)
* @return EXIT_SUCCESS if at least one valid hash was processed,
* EXIT_FAILURE if input is null/empty or no hash was valid
*/
int component_scan(char *url_hash_list);

/**
* @brief Run a snippet-only scan whose WFP input comes from stdin.
*
* Reads a WFP block (same format used by `-w` scans) from stdin, runs the
* snippet selection pipeline (no full-file lookup, no component resolution)
* and prints a JSON report listing the file_md5 candidates grouped by snippet
* region, together with their input/oss line ranges. Candidate cohort size is
* controlled by the tolerance set via -T (match_list_tolerance_set).
*
* Selected from the command line with `-S -` (a lone dash as the argument).
* main() runs the scan and exits while it is still parsing -S, so a -T option
* only takes effect when it appears before -S on the command line.
*
* @return EXIT_SUCCESS on success, EXIT_FAILURE on invalid/empty input
*/
int snippet_scan_stdin(void);

/**
* @brief Run a snippet-only scan whose WFP input is passed as a string.
*
* Same behavior as snippet_scan_stdin() but reads the WFP block from the
* provided in-memory buffer. Used by `-S "<wfp>"` so callers (e.g. FlexAPI)
* can pass the WFP directly as an argv value instead of piping it via stdin.
*
* main() calls this function for every -S argument except the exact string
* "-", which is routed to snippet_scan_stdin() instead; the return value is
* handed straight to exit().
*
* @param wfp NUL-terminated buffer holding the WFP block
* @return EXIT_SUCCESS on success, EXIT_FAILURE on invalid/empty input
*/
int snippet_scan_string(const char *wfp);

#endif
1 change: 1 addition & 0 deletions inc/report.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ void json_open();
void json_close(void);
void kb_version_get(void);
bool print_json_component(component_data_t * component);
void print_purl_array(component_data_t * component);
#endif
2 changes: 1 addition & 1 deletion inc/scanoss.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
#define WFP_LN 4
#define WFP_REC_LN 18

#define SCANOSS_VERSION "5.4.25"
#define SCANOSS_VERSION "5.4.26"

/* Log files */
#define SCAN_LOG "/tmp/scanoss_scan.log"
Expand Down
45 changes: 22 additions & 23 deletions src/copyright.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,28 +35,27 @@
#include "util.h"
#include "decrypt.h"
#include "debug.h"
const char *copyright_sources[] = {"component_declared", "file_header", "license_file", "scancode"};

/**
* @brief get fisrt copyright LDB function pointer. Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details.
* @param key //TODO
* @param subkey //TODO
* @param subkey_ln //TODO
* @param[out] data //TODO
* @param datalen //TODO
* @param iteration //TODO
* @param ptr output pointer, returns the fisrt copyright obtained from the database
* @return //TODO
*/
/* static bool get_first_copyright(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr)
/**
* @brief Translate a numeric copyright source id into the source name printed in the report.
*
* Ids 1 and 5 map to "file_header"; ids 0, 2, 6 and 8 map to "license_file";
* ids 3, 4 and 7 map to "scancode". Every other id is treated as unknown.
* print_copyrights_item() only emits an entry when the value returned here is
* not NULL.
*
* NOTE(review): the statements between the opening brace and the switch
* (decrypt_data ... return true, ending at the old comment terminator) look
* like leftovers of the removed, commented-out get_first_copyright() body
* carried over by the diff view; they are presumably not part of this
* function - confirm against the committed file.
*
* @param id numeric copyright source identifier
* @return pointer to a string literal naming the source, or NULL when the id is not recognised
*/
static char * copyright_id_to_source_name(int id)
{
char * result = decrypt_data(data, datalen, oss_copyright, key, subkey);
if (result)
strncpy(ptr, skip_first_comma((char *) result), MAX_COPYRIGHT);

free(result);
return true;
}*/
switch (id)
{
case 1:
case 5:
return "file_header";
case 0:
case 2:
case 6:
case 8:
return "license_file";
case 3:
case 4:
case 7:
return "scancode";
default:
return NULL;
}
}

/**
* @brief Remove undesired characters from a copyright
Expand Down Expand Up @@ -105,13 +104,13 @@ static bool print_copyrights_item(uint8_t *key, uint8_t *subkey, int subkey_ln,

char result[MAX_FIELD_LN] = "\0";
int len = 0;

if (!dup && (*copyright) && (src <= (sizeof(copyright_sources) / sizeof(copyright_sources[0]))))
char * source_id = copyright_id_to_source_name(src);
if (!dup && (*copyright) && source_id)
{
if (comp->copyright_text)
len += sprintf(result+len,",");
len += sprintf(result+len,"{\"name\": \"%s\",", copyright);
len += sprintf(result+len,"\"source\": \"%s\"}", copyright_sources[atoi(source)]);
len += sprintf(result+len,"\"source\": \"%s\"}", source_id);
}
if (*result)
str_cat_realloc(&comp->copyright_text, result);
Expand Down
6 changes: 6 additions & 0 deletions src/help.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ Configuration:\n\
-a, --attribution FILE Show attribution notices for the provided SBOM.json file.\n\
-c, --component HINT Add a component HINT to guide scan results.\n\
-k, --key KEY Show contents of the specified KEY file from MZ sources archive.\n\
--max-file-content-size MB Set maximum file content size in MB printed by -k (default: 50).\n\
-P, --purl MD5 Return the purls and versions related to the given file MD5 (JSON).\n\
-C, --url-hash MD5 Return the details of the component identified by the given url hash (JSON).\n\
-S, --snippet-scan WFP Snippet-only scan: take a single-file WFP block as argument and return JSON with candidate\n\
file_md5s and their line ranges, filtered by the tolerance set via -T (precede -S with -T\n\
to apply). Use -S - to read the WFP from stdin instead of passing it as argument.\n\
-l, --license LICENSE Display OSADL metadata for the given SPDX license ID.\n\
-L, --full-license Enable full license report.\n\
-F, --flags FLAGS Set engine scanning flags (see below).\n\
Expand Down
7 changes: 5 additions & 2 deletions src/ignorelist.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ char *extension(char *path)
{
char *dot = strrchr(path, '.');
char *slash = strrchr(path, '/');
if (!slash) slash = path;

if (!dot && !slash) return NULL;
/* No dot, or the last dot belongs to a parent directory (e.g. "a.b/file"):
the file has no extension. Returning the basename here (as the previous
implementation did) wrongly treated a plain filename as an extension. */
if (!dot) return NULL;
if (dot > slash) return dot + 1;
if (slash != path) return slash + 1;
return NULL;
}

Expand Down
1 change: 1 addition & 0 deletions src/limits.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
*/

int fetch_max_files = 12000; /** Maximum number of files to fetch during component matching */
uint64_t max_file_content_size = MAX_FILE_CONTENT_SIZE_DEFAULT; /** Maximum file content size printed by -k */
38 changes: 27 additions & 11 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "parse.h"
#include "report.h"
#include "scan.h"
#include "purl_scan.h"
#include "scanoss.h"
#include "util.h"
#include "component.h"
Expand Down Expand Up @@ -278,8 +279,11 @@ static struct option long_options[] = {
{"sbom", required_argument, 0, 's'},
{"blacklist", required_argument, 0, 'b'},
{"force-snippet", required_argument, 0, 256}, /* Long option only, no short form */
{"snippet-scan", required_argument, 0, 'S'},
{"component", required_argument, 0, 'c'},
{"key", required_argument, 0, 'k'},
{"purl", required_argument, 0, 'P'},
{"url-hash", required_argument, 0, 'C'},
{"attribution", required_argument, 0, 'a'},
{"flags", required_argument, 0, 'F'},
{"license", required_argument, 0, 'l'},
Expand All @@ -292,6 +296,7 @@ static struct option long_options[] = {
{"min-snippet-lines", required_argument, 0, 259}, /* Long option only */
{"ignore-file-ext", no_argument, 0, 260}, /* Long option only */
{"range-tolerance", required_argument, 0, 261}, /* Long option only */
{"max-file-content-size", required_argument, 0, 262}, /* Long option only */
{"wfp", no_argument, 0, 'w'},
{"test", no_argument, 0, 't'},
{"version", no_argument, 0, 'v'},
Expand Down Expand Up @@ -334,18 +339,8 @@ int main(int argc, char **argv)
bool invalid_argument = false;
char * ldb_db_name = NULL;

while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:wtLvhdqH", long_options, &option_index)) != -1)
while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:P:C:S:wtLvhdqH", long_options, &option_index)) != -1)
{
/* Check valid alpha is entered */
if (optarg)
{
if ((strlen(optarg) > MAX_ARGLN))
{
invalid_argument = true;
break;
}
}

switch (option)
{
case 's':
Expand All @@ -371,6 +366,16 @@ int main(int argc, char **argv)
exit(EXIT_SUCCESS);
break;

case 'P':
initialize_ldb_tables(ldb_db_name);
exit(purl_scan(optarg));
break;

case 'C':
initialize_ldb_tables(ldb_db_name);
exit(component_scan(optarg));
break;

case 'a':
if (declared_components)
{
Expand Down Expand Up @@ -414,6 +419,12 @@ int main(int argc, char **argv)
case 256: /* --force-snippet (long option only) */
force_snippet_scan = true;
break;
case 'S':
initialize_ldb_tables(ldb_db_name);
if (optarg && optarg[0] == '-' && optarg[1] == '\0')
exit(snippet_scan_stdin());
exit(snippet_scan_string(optarg));
break;
case 't':
initialize_ldb_tables(ldb_db_name);
scan_benchmark();
Expand Down Expand Up @@ -480,6 +491,11 @@ int main(int argc, char **argv)
scanlog("Range tolerance set to %d\n", scan_range_tolerance);
break;

case 262: /* --max-file-content-size (value in MB) */
max_file_content_size = strtoull(optarg, NULL, 10) * 1024 * 1024;
scanlog("Max file content size set to %lu MB\n", (unsigned long) (max_file_content_size / (1024 * 1024)));
break;
Comment on lines +494 to +497

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟡 Minor | ⚡ Quick win

Validate --max-file-content-size input.

strtoull returns 0 for non-numeric input, and an explicit 0 is accepted as-is. Either case sets max_file_content_size = 0, which makes mz_get_key() (src/mz.c Line 105) reject every file with the "exceeds the maximum allowed" error. Consider rejecting 0/unparseable values, or treating 0 as "unlimited".

🛡️ Suggested guard
 			case 262: /* --max-file-content-size (value in MB) */
-				max_file_content_size = strtoull(optarg, NULL, 10) * 1024 * 1024;
+			{
+				char *endptr = NULL;
+				unsigned long long mb = strtoull(optarg, &endptr, 10);
+				if (endptr == optarg || *endptr != '\0' || mb == 0)
+				{
+					fprintf(stderr, "Invalid --max-file-content-size value: %s\n", optarg);
+					exit(EXIT_FAILURE);
+				}
+				max_file_content_size = mb * 1024 * 1024;
+			}
 				scanlog("Max file content size set to %lu MB\n", (unsigned long) (max_file_content_size / (1024 * 1024)));
 				break;
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
case 262: /* --max-file-content-size (value in MB) */
max_file_content_size = strtoull(optarg, NULL, 10) * 1024 * 1024;
scanlog("Max file content size set to %lu MB\n", (unsigned long) (max_file_content_size / (1024 * 1024)));
break;
case 262: /* --max-file-content-size (value in MB) */
{
char *endptr = NULL;
unsigned long long mb = strtoull(optarg, &endptr, 10);
if (endptr == optarg || *endptr != '\0' || mb == 0)
{
fprintf(stderr, "Invalid --max-file-content-size value: %s\n", optarg);
exit(EXIT_FAILURE);
}
max_file_content_size = mb * 1024 * 1024;
}
scanlog("Max file content size set to %lu MB\n", (unsigned long) (max_file_content_size / (1024 * 1024)));
break;
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/main.c` around lines 494 - 497, Validate the `--max-file-content-size`
option in the `case 262` handler before assigning `max_file_content_size`,
because `strtoull` currently accepts non-numeric input and zero, which can
unintentionally disable all file scanning in `mz_get_key`. Add input checking in
the argument parsing path in `main` so invalid or zero values are either
rejected with an error or explicitly treated as unlimited, and keep the existing
`scanlog` message consistent with the chosen behavior.


case 'H':
if (hpsm_lib_load())
hpsm_enabled = true;
Expand Down
27 changes: 10 additions & 17 deletions src/match.c
Original file line number Diff line number Diff line change
Expand Up @@ -301,26 +301,19 @@ int compare_file_extension(component_data_t *a, component_data_t *b)
char *ext_a = extension(a->file);
char *ext_b = extension(b->file);

if (!ext_a && ext_b)
return 1;

if (ext_a && !ext_b)
return -1;

if (!ext_a && !ext_b)
/* A candidate is preferred only when its extension actually matches the
scanned file's extension. The mere presence/absence of an extension is
not a valid criterion: doing so would prefer any extended path over a
plain filename even when neither matches the scanned file. */
bool match_a = ext_a && !strcmp(ext_a, ext_file);
bool match_b = ext_b && !strcmp(ext_b, ext_file);

if (match_a == match_b)
return 0;

int result_a = strcmp(ext_a, ext_file);
int result_b = strcmp(ext_b, ext_file);

if (result_a == result_b)
return 0;
else if (!result_a)
else if (match_a)
return -1;
else if (!result_b)
else
return 1;

return 0;
}

/**
Expand Down
7 changes: 6 additions & 1 deletion src/match_list.c
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,16 @@ void match_list_tolerance_set(float in)
{
if (in > 99)
in = 99;

match_list_tolerance = 100.0-in;
scanlog("setting match list tolerance to %.1f\n", match_list_tolerance);
}

/**
 * @brief Read back the match list tolerance in the units taken by match_list_tolerance_set().
 *
 * match_list_tolerance_set() stores 100 minus its argument (after capping the
 * argument at 99); this getter undoes that subtraction.
 *
 * @return 100 minus the currently stored match_list_tolerance value
 */
float match_list_tolerance_get(void)
{
	/* Same double-precision subtraction as before; the cast spells out the
	   conversion the return statement would perform implicitly. */
	return (float) (100.0 - match_list_tolerance);
}

bool tolerance_eval(int a, int b)
{
int relative_error = (abs(a - b) * 100) / ((a + b) / 2);
Expand Down
11 changes: 11 additions & 0 deletions src/mz.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "decrypt.h"
#include <ldb.h>
#include "debug.h"
#include "limits.h"
#include <unistd.h>

/**
Expand Down Expand Up @@ -100,6 +101,16 @@ void mz_get_key(struct ldb_table kb, char *key)
/* Decompress */
MZ_DEFLATE(&job);

/* Reject files whose content exceeds the configured maximum size */
if (job.data_ln > max_file_content_size)
{
fprintf(stderr, "File content size (%.2f MB) exceeds the maximum allowed (%lu MB). Use --max-file-content-size to change the limit.\n", (double) job.data_ln / (1024 * 1024), (unsigned long) (max_file_content_size / (1024 * 1024)));
free(job.data);
free(job.key);
free(job.mz);
exit(EXIT_FAILURE);
}

Comment on lines +104 to +113

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟡 Minor | ⚡ Quick win

Limit check looks correct; note max_file_content_size == 0 rejects all files.

The decompressed-size guard and cleanup ordering are correct. Be aware the effective behavior depends on input validation for --max-file-content-size; a 0 value (from invalid/empty input) would reject every file here. See the related comment in src/main.c (case 262).

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/mz.c` around lines 104 - 113, The size guard in mz.c is fine, but a zero
value for max_file_content_size will cause every file to be rejected here.
Update the --max-file-content-size handling in main.c (case 262 / the option
parsing that sets max_file_content_size) to validate the parsed value and reject
or default invalid/empty input so this variable is never left at 0. Make the
validation happen before mz.c uses the limit, and keep the existing rejection
path unchanged.

job.data[job.data_ln] = 0;
printf("%s", job.data);
return;
Expand Down
Loading
Loading