diff --git a/inc/limits.h b/inc/limits.h
index c4f781d..15626ad 100644
--- a/inc/limits.h
+++ b/inc/limits.h
@@ -28,6 +28,7 @@
/* Limits */
#define MAX_ARGLN 512 // Max command line argument length
+#define MAX_COMPONENT_SCAN_HASHES 1000 // Max number of url hashes accepted by -C
#define MAX_PATH 1024
#define MAX_HASHES_READ 65535
#define MAX_FILE_SIZE (1024 * 1024 * 4)
@@ -51,7 +52,9 @@
#define SNIPPETS_DEFAULT_ADJUST_TOLERANCE true /** Adjust tolerance based on file size */
#define SNIPPETS_DEFAULT_HONOR_FILE_EXTENSION true /** Honor file extension during snippet matching */
#define DEFAULT_FETCH_MAX_FILES 12000 /** Maximum number of files to fetch during component matching */
+#define MAX_FILE_CONTENT_SIZE_DEFAULT (1024ULL * 1024 * 50) /** Default maximum file content size (50MB) printed by -k */
/* Variables */
extern int fetch_max_files; // Maximum number of files to fetch during component matching
+extern uint64_t max_file_content_size; // Maximum file content size printed by -k
#endif
diff --git a/inc/match_list.h b/inc/match_list.h
index 6520724..de20c11 100644
--- a/inc/match_list.h
+++ b/inc/match_list.h
@@ -147,5 +147,6 @@ void component_list_destroy(component_list_t *list);
bool component_list_add_binary(component_list_t *list, component_data_t *new_comp, bool (*val)(component_data_t *a, component_data_t *b), bool remove_a);
bool match_list_eval(match_list_t *list, match_data_t * in, bool (*eval)(match_data_t *fpa, match_data_t *fpb));
void match_list_tolerance_set(float in);
+float match_list_tolerance_get(void);
#endif
diff --git a/inc/purl_scan.h b/inc/purl_scan.h
new file mode 100644
index 0000000..6533d33
--- /dev/null
+++ b/inc/purl_scan.h
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * inc/purl_scan.h
+ *
+ * SCANOSS Inventory Scanner
+ *
+ * Copyright (C) 2018-2024 SCANOSS.COM
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef __PURL_SCAN_H
+#define __PURL_SCAN_H
+
+/**
+ * @brief Resolve the purls and url hashes related to a file MD5.
+ *
+ * Looks up the given file MD5 in the KB (url and file tables) and prints, in
+ * JSON, the unique purls associated with that file along with every url hash
+ * (url_id) where the file was seen and the best (lowest) KB rank found for
+ * each purl.
+ *
+ * @param file_md5_hex file MD5 in hex (32 chars)
+ * @return EXIT_SUCCESS on success, EXIT_FAILURE on invalid input
+ */
+int purl_scan(char *file_md5_hex);
+
+/**
+ * @brief Report the details of one or more components identified by url hash.
+ *
+ * Accepts a single url_hash (url_id) or a comma-separated list. Each hash is
+ * looked up in the KB and its component details are rendered in JSON, reusing
+ * the same rendering used in regular scan reports. Output is always an array
+ * under the "results" key, one entry per valid hash:
+ * {"results": [{"url_hash": "...", "component": {...}}, ...]}.
+ * Invalid hashes are skipped with a stderr warning.
+ *
+ * @param url_hash_list comma-separated url hashes in hex (32 chars each)
+ * @return EXIT_SUCCESS if at least one valid hash was processed,
+ * EXIT_FAILURE if input is null/empty or no hash was valid
+ */
+int component_scan(char *url_hash_list);
+
+/**
+ * @brief Run a snippet-only scan whose WFP input comes from stdin.
+ *
+ * Reads a WFP block (same format used by `-w` scans) from stdin, runs the
+ * snippet selection pipeline (no full-file lookup, no component resolution)
+ * and prints a JSON report listing the file_md5 candidates grouped by snippet
+ * region, together with their input/oss line ranges. Candidate cohort size is
+ * controlled by the tolerance set via -T (match_list_tolerance_set).
+ *
+ * @return EXIT_SUCCESS on success, EXIT_FAILURE on invalid/empty input
+ */
+int snippet_scan_stdin(void);
+
+/**
+ * @brief Run a snippet-only scan whose WFP input is passed as a string.
+ *
+ * Same behavior as snippet_scan_stdin() but reads the WFP block from the
+ * provided in-memory buffer. Used by `-S "<wfp>"` so callers (e.g. FlexAPI)
+ * can pass the WFP directly as an argv value instead of piping it via stdin.
+ *
+ * @param wfp NUL-terminated buffer holding the WFP block
+ * @return EXIT_SUCCESS on success, EXIT_FAILURE on invalid/empty input
+ */
+int snippet_scan_string(const char *wfp);
+
+#endif
diff --git a/inc/report.h b/inc/report.h
index da70165..b55bc50 100644
--- a/inc/report.h
+++ b/inc/report.h
@@ -16,4 +16,5 @@ void json_open();
void json_close(void);
void kb_version_get(void);
bool print_json_component(component_data_t * component);
+void print_purl_array(component_data_t * component);
#endif
diff --git a/inc/scanoss.h b/inc/scanoss.h
index 8985597..1f4a2be 100644
--- a/inc/scanoss.h
+++ b/inc/scanoss.h
@@ -33,7 +33,7 @@
#define WFP_LN 4
#define WFP_REC_LN 18
-#define SCANOSS_VERSION "5.4.25"
+#define SCANOSS_VERSION "5.4.26"
/* Log files */
#define SCAN_LOG "/tmp/scanoss_scan.log"
diff --git a/src/copyright.c b/src/copyright.c
index 3e1edb1..c7d7a74 100644
--- a/src/copyright.c
+++ b/src/copyright.c
@@ -35,28 +35,27 @@
#include "util.h"
#include "decrypt.h"
#include "debug.h"
-const char *copyright_sources[] = {"component_declared", "file_header", "license_file", "scancode"};
-/**
- * @brief get fisrt copyright LDB function pointer. Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details.
- * @param key //TODO
- * @param subkey //TODO
- * @param subkey_ln //TODO
- * @param[out] data //TODO
- * @param datalen //TODO
- * @param iteration //TODO
- * @param ptr output pointer, returns the fisrt copyright obtained from the database
- * @return //TODO
- */
-/* static bool get_first_copyright(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr)
+static char * copyright_id_to_source_name(int id) /* maps a copyright source id to the name printed in the JSON "source" field; NULL means the id is unknown */
{
- char * result = decrypt_data(data, datalen, oss_copyright, key, subkey);
- if (result)
- strncpy(ptr, skip_first_comma((char *) result), MAX_COPYRIGHT);
-
- free(result);
- return true;
-}*/
+ switch (id)
+ {
+ case 1:
+ case 5:
+ return "file_header";
+ case 0:
+ case 2:
+ case 6:
+ case 8:
+ return "license_file";
+ case 3:
+ case 4:
+ case 7:
+ return "scancode";
+ default:
+ return NULL;
+ }
+}
/**
* @brief //Remove undesired characteres from a copyright
@@ -105,13 +104,13 @@ static bool print_copyrights_item(uint8_t *key, uint8_t *subkey, int subkey_ln,
char result[MAX_FIELD_LN] = "\0";
int len = 0;
-
- if (!dup && (*copyright) && (src <= (sizeof(copyright_sources) / sizeof(copyright_sources[0]))))
+ char * source_id = copyright_id_to_source_name(src);
+ if (!dup && (*copyright) && source_id)
{
if (comp->copyright_text)
len += sprintf(result+len,",");
len += sprintf(result+len,"{\"name\": \"%s\",", copyright);
- len += sprintf(result+len,"\"source\": \"%s\"}", copyright_sources[atoi(source)]);
+ len += sprintf(result+len,"\"source\": \"%s\"}", source_id);
}
if (*result)
str_cat_realloc(&comp->copyright_text, result);
diff --git a/src/help.c b/src/help.c
index 441656a..df67845 100644
--- a/src/help.c
+++ b/src/help.c
@@ -65,6 +65,12 @@ Configuration:\n\
-a, --attribution FILE Show attribution notices for the provided SBOM.json file.\n\
-c, --component HINT Add a component HINT to guide scan results.\n\
-k, --key KEY Show contents of the specified KEY file from MZ sources archive.\n\
+ --max-file-content-size MB Set maximum file content size in MB printed by -k (default: 50).\n\
+-P, --purl MD5 Return the purls and versions related to the given file MD5 (JSON).\n\
+-C, --url-hash MD5 Return the details of the component identified by the given url hash (JSON).\n\
+-S, --snippet-scan WFP Snippet-only scan: take a single-file WFP block as argument and return JSON with candidate\n\
+ file_md5s and their line ranges, filtered by the tolerance set via -T (precede -S with -T\n\
+ to apply). Use -S - to read the WFP from stdin instead of passing it as argument.\n\
-l, --license LICENSE Display OSADL metadata for the given SPDX license ID.\n\
-L, --full-license Enable full license report.\n\
-F, --flags FLAGS Set engine scanning flags (see below).\n\
diff --git a/src/ignorelist.c b/src/ignorelist.c
index 2fdf920..5ad11bf 100644
--- a/src/ignorelist.c
+++ b/src/ignorelist.c
@@ -46,10 +46,13 @@ char *extension(char *path)
{
char *dot = strrchr(path, '.');
char *slash = strrchr(path, '/');
+ if (!slash) slash = path;
- if (!dot && !slash) return NULL;
+ /* No dot, or the last dot belongs to a parent directory (e.g. "a.b/file"):
+ the file has no extension. Returning the basename here (as the previous
+ implementation did) wrongly treated a plain filename as an extension. */
+ if (!dot) return NULL;
if (dot > slash) return dot + 1;
- if (slash != path) return slash + 1;
return NULL;
}
diff --git a/src/limits.c b/src/limits.c
index 67b6cfd..ac3787f 100644
--- a/src/limits.c
+++ b/src/limits.c
@@ -10,3 +10,4 @@
*/
int fetch_max_files = 12000; /** Maximum number of files to fetch during component matching */
+uint64_t max_file_content_size = MAX_FILE_CONTENT_SIZE_DEFAULT; /** Maximum file content size printed by -k */
diff --git a/src/main.c b/src/main.c
index 75d0ed0..fb91fe2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -40,6 +40,7 @@
#include "parse.h"
#include "report.h"
#include "scan.h"
+#include "purl_scan.h"
#include "scanoss.h"
#include "util.h"
#include "component.h"
@@ -278,8 +279,11 @@ static struct option long_options[] = {
{"sbom", required_argument, 0, 's'},
{"blacklist", required_argument, 0, 'b'},
{"force-snippet", required_argument, 0, 256}, /* Long option only, no short form */
+ {"snippet-scan", required_argument, 0, 'S'},
{"component", required_argument, 0, 'c'},
{"key", required_argument, 0, 'k'},
+ {"purl", required_argument, 0, 'P'},
+ {"url-hash", required_argument, 0, 'C'},
{"attribution", required_argument, 0, 'a'},
{"flags", required_argument, 0, 'F'},
{"license", required_argument, 0, 'l'},
@@ -292,6 +296,7 @@ static struct option long_options[] = {
{"min-snippet-lines", required_argument, 0, 259}, /* Long option only */
{"ignore-file-ext", no_argument, 0, 260}, /* Long option only */
{"range-tolerance", required_argument, 0, 261}, /* Long option only */
+ {"max-file-content-size", required_argument, 0, 262}, /* Long option only */
{"wfp", no_argument, 0, 'w'},
{"test", no_argument, 0, 't'},
{"version", no_argument, 0, 'v'},
@@ -334,18 +339,8 @@ int main(int argc, char **argv)
bool invalid_argument = false;
char * ldb_db_name = NULL;
- while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:wtLvhdqH", long_options, &option_index)) != -1)
+ while ((option = getopt_long(argc, argv, ":r:T:s:b:c:k:a:F:l:n:M:N:P:C:S:wtLvhdqH", long_options, &option_index)) != -1)
{
- /* Check valid alpha is entered */
- if (optarg)
- {
- if ((strlen(optarg) > MAX_ARGLN))
- {
- invalid_argument = true;
- break;
- }
- }
-
switch (option)
{
case 's':
@@ -371,6 +366,16 @@ int main(int argc, char **argv)
exit(EXIT_SUCCESS);
break;
+ case 'P':
+ initialize_ldb_tables(ldb_db_name);
+ exit(purl_scan(optarg));
+ break;
+
+ case 'C':
+ initialize_ldb_tables(ldb_db_name);
+ exit(component_scan(optarg));
+ break;
+
case 'a':
if (declared_components)
{
@@ -414,6 +419,12 @@ int main(int argc, char **argv)
case 256: /* --force-snippet (long option only) */
force_snippet_scan = true;
break;
+ case 'S':
+ initialize_ldb_tables(ldb_db_name);
+ if (optarg && optarg[0] == '-' && optarg[1] == '\0')
+ exit(snippet_scan_stdin());
+ exit(snippet_scan_string(optarg));
+ break;
case 't':
initialize_ldb_tables(ldb_db_name);
scan_benchmark();
@@ -480,6 +491,11 @@ int main(int argc, char **argv)
scanlog("Range tolerance set to %d\n", scan_range_tolerance);
break;
+ case 262: /* --max-file-content-size (value in MB) */
+ max_file_content_size = strtoull(optarg, NULL, 10) * 1024 * 1024;
+ scanlog("Max file content size set to %lu MB\n", (unsigned long) (max_file_content_size / (1024 * 1024)));
+ break;
+
case 'H':
if (hpsm_lib_load())
hpsm_enabled = true;
diff --git a/src/match.c b/src/match.c
index c20d927..6ae56c7 100644
--- a/src/match.c
+++ b/src/match.c
@@ -301,26 +301,19 @@ int compare_file_extension(component_data_t *a, component_data_t *b)
char *ext_a = extension(a->file);
char *ext_b = extension(b->file);
- if (!ext_a && ext_b)
- return 1;
-
- if (ext_a && !ext_b)
- return -1;
-
- if (!ext_a && !ext_b)
+ /* A candidate is preferred only when its extension actually matches the
+ scanned file's extension. The mere presence/absence of an extension is
+ not a valid criterion: doing so would prefer any extended path over a
+ plain filename even when neither matches the scanned file. */
+ bool match_a = ext_a && !strcmp(ext_a, ext_file);
+ bool match_b = ext_b && !strcmp(ext_b, ext_file);
+
+ if (match_a == match_b)
return 0;
-
- int result_a = strcmp(ext_a, ext_file);
- int result_b = strcmp(ext_b, ext_file);
-
- if (result_a == result_b)
- return 0;
- else if (!result_a)
+ else if (match_a)
return -1;
- else if (!result_b)
+ else
return 1;
-
- return 0;
}
/**
diff --git a/src/match_list.c b/src/match_list.c
index 134e429..596a031 100644
--- a/src/match_list.c
+++ b/src/match_list.c
@@ -215,11 +215,16 @@ void match_list_tolerance_set(float in)
{
if (in > 99)
in = 99;
-
+
match_list_tolerance = 100.0-in;
scanlog("setting match list tolerance to %.1f\n", match_list_tolerance);
}
+float match_list_tolerance_get(void) /* inverse of match_list_tolerance_set(): the setter stores 100 - in (in capped at 99), this returns 100 - stored */
+{
+ return 100.0 - match_list_tolerance;
+}
+
bool tolerance_eval(int a, int b)
{
int relative_error = (abs(a - b) * 100) / ((a + b) / 2);
diff --git a/src/mz.c b/src/mz.c
index c3ab15a..3d0404c 100644
--- a/src/mz.c
+++ b/src/mz.c
@@ -36,6 +36,7 @@
#include "decrypt.h"
#include
#include "debug.h"
+#include "limits.h"
#include
/**
@@ -100,6 +101,16 @@ void mz_get_key(struct ldb_table kb, char *key)
/* Decompress */
MZ_DEFLATE(&job);
+ /* Reject files whose content exceeds the configured maximum size */
+ if (job.data_ln > max_file_content_size)
+ {
+ fprintf(stderr, "File content size (%.2f MB) exceeds the maximum allowed (%lu MB). Use --max-file-content-size to change the limit.\n", (double) job.data_ln / (1024 * 1024), (unsigned long) (max_file_content_size / (1024 * 1024)));
+ free(job.data);
+ free(job.key);
+ free(job.mz);
+ exit(EXIT_FAILURE);
+ }
+
job.data[job.data_ln] = 0;
printf("%s", job.data);
return;
diff --git a/src/purl_scan.c b/src/purl_scan.c
new file mode 100644
index 0000000..2c12442
--- /dev/null
+++ b/src/purl_scan.c
@@ -0,0 +1,706 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * src/purl_scan.c
+ *
+ * SCANOSS Inventory Scanner
+ *
+ * Copyright (C) 2018-2024 SCANOSS.COM
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file purl_scan.c
+ * @brief Resolve the purls and url hashes related to a given file MD5.
+ *
+ * This implements the "-P <md5>" command: given a file MD5 it walks the
+ * KB (url and file tables) and reports, in JSON, the unique purls associated
+ * with that file, the url hashes (url_id) where the file was seen for each
+ * purl and the best (lowest) KB rank found. It does not use the best-match
+ * selection logic.
+ */
+
+#include <libgen.h>
+#include "scanoss.h"
+#include "debug.h"
+#include "decrypt.h"
+#include "parse.h"
+#include "util.h"
+#include "limits.h"
+#include "component.h"
+#include "report.h"
+#include "url.h"
+#include "license.h"
+#include "health.h"
+#include "dependency.h"
+#include "copyright.h"
+#include "vulnerability.h"
+#include "scan.h"
+#include "match.h"
+#include "match_list.h"
+#include "snippets.h"
+#include "purl_scan.h"
+
+/* Snippet-scan configuration globals (defined in main.c) */
+extern int scan_max_snippets;
+extern int scan_max_components;
+extern bool scan_adjust_tolerance;
+extern int scan_ranking_threshold;
+extern int scan_min_match_hits;
+extern int scan_min_match_lines;
+extern int scan_range_tolerance;
+extern bool scan_honor_file_extension;
+
+/* Single purl entry: the purl, the set of url hashes seen and the best rank */
+typedef struct purl_entry_t
+{
+ char *purl; /* heap copy of the purl (strdup'd in purl_entry_get, freed by purl_scan) */
+ char **url_hashes; /* growable array of heap-allocated hex url hashes, kept free of duplicates */
+ int n_url_hashes; /* number of url_hashes slots in use */
+ int url_hashes_cap; /* allocated slots: starts at 8 and doubles when full */
+ int rank; /* lowest positive rank seen; COMPONENT_DEFAULT_RANK until one is found */
+ struct purl_entry_t *next; /* next entry of the singly linked list headed by ctx->head */
+} purl_entry_t;
+
+/* Context passed through the ldb recordset handlers */
+typedef struct purl_scan_ctx_t
+{
+ purl_entry_t *head; /* most recently created purl entry; NULL while the list is empty */
+ int count; /* number of entries in the list (one per distinct purl) */
+ uint32_t files_processed; /* file records for which a url table lookup was issued */
+} purl_scan_ctx_t;
+
+/* MD5 of the empty string (d41d8cd98f00b204e9800998ecf8427e): file table records carrying it as url id are skipped */
+static const uint8_t empty_string_md5[MD5_LEN] =
+ {0xd4,0x1d,0x8c,0xd9,0x8f,0x00,0xb2,0x04,0xe9,0x80,0x09,0x98,0xec,0xf8,0x42,0x7e};
+
+/**
+ * @brief Find the entry for a purl, creating it at the head of the context list if absent.
+ * @return the entry, or NULL if a new entry cannot be allocated (the list is left untouched)
+ */
+static purl_entry_t * purl_entry_get(purl_scan_ctx_t *ctx, const char *purl)
+{
+	for (purl_entry_t *e = ctx->head; e; e = e->next)
+		if (!strcmp(e->purl, purl))
+			return e;
+	purl_entry_t *e = calloc(1, sizeof(*e));
+	if (!e || !(e->purl = strdup(purl))) { free(e); return NULL; } /* never link a half-built entry */
+	e->rank = COMPONENT_DEFAULT_RANK; /* replaced later by the lowest positive rank seen */
+	e->next = ctx->head;
+	ctx->head = e;
+	ctx->count++;
+	return e;
+}
+
+/**
+ * @brief Add a url hash to a purl entry, ignoring duplicates, empty values and NULL entries; on allocation failure the hash is dropped.
+ */
+static void purl_entry_add_url_hash(purl_entry_t *e, const char *url_hash)
+{
+	if (!e || !url_hash || !*url_hash) return;
+	for (int i = 0; i < e->n_url_hashes; i++)
+		if (!strcmp(e->url_hashes[i], url_hash))
+			return;
+	if (e->n_url_hashes >= e->url_hashes_cap)
+	{
+		int new_cap = e->url_hashes_cap ? e->url_hashes_cap * 2 : 8;
+		char **grown = realloc(e->url_hashes, new_cap * sizeof(char *));
+		if (!grown) return; /* realloc failed: the old array is still valid, keep it */
+		e->url_hashes = grown;
+		e->url_hashes_cap = new_cap;
+	}
+	if ((e->url_hashes[e->n_url_hashes] = strdup(url_hash))) e->n_url_hashes++; /* a failed copy is not counted */
+}
+
+/* qsort comparator for the url hash arrays: plain strcmp order, giving a deterministic output */
+static int url_hash_cmp(const void *a, const void *b)
+{
+ return strcmp(*(const char **) a, *(const char **) b); /* each element is a char * */
+}
+
+static int purl_entry_ptr_cmp(const void *a, const void *b)
+{
+	/* Elements are purl_entry_t pointers: order by rank ascending, ties broken by purl */
+	const purl_entry_t *lhs = *(const purl_entry_t **) a;
+	const purl_entry_t *rhs = *(const purl_entry_t **) b;
+	const int rank_delta = lhs->rank - rhs->rank;
+	return rank_delta ? rank_delta : strcmp(lhs->purl, rhs->purl);
+}
+
+/**
+ * @brief url table recordset handler. Extracts the purl and rank from the url
+ * record and stores the url_id (= url_hash) under the matching purl entry.
+ * @param key url id of the record, stored in hex as the url hash
+ * @param ptr purl_scan_ctx_t accumulating the results
+ * @return always false
+ */
+static bool handle_url_for_purls(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr)
+{
+	if (!datalen)
+		return false;
+	char *data = decrypt_data(raw_data, datalen, oss_url, key, subkey);
+	if (!data)
+		return false;
+	/* Start from empty strings so a record lacking a field is never read as garbage */
+	char purl[MAX_FILE_PATH] = "";
+	char rank[MAX_FIELD_LN] = "";
+	extract_csv(purl, data, 6, sizeof(purl));
+	extract_csv(rank, data, -1, sizeof(rank));
+	free(data);
+	if (!*purl)
+		return false;
+
+	purl_scan_ctx_t *ctx = (purl_scan_ctx_t *) ptr;
+	purl_entry_t *e = purl_entry_get(ctx, purl);
+	if (!e) /* no entry available for this purl: skip the record */
+		return false;
+
+	char url_hash_hex[MD5_LEN * 2 + 1];
+	ldb_bin_to_hex(key, MD5_LEN, url_hash_hex);
+	purl_entry_add_url_hash(e, url_hash_hex);
+
+	/* Keep the lowest strictly positive rank seen for this purl */
+	int r = *rank ? atoi(rank) : 0;
+	if (r > 0 && r < e->rank)
+		e->rank = r;
+	return false;
+}
+
+/**
+ * @brief file table recordset handler.
+ * The first MD5_LEN bytes of each record are taken as a url id, which is then
+ * looked up in the url table with handle_url_for_purls sharing the same context.
+ * @param iteration record index, compared against fetch_max_files
+ * @param ptr purl_scan_ctx_t accumulating the results
+ * @return true when iteration has reached fetch_max_files, false otherwise
+ */
+static bool handle_file_for_purls(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr)
+{
+	purl_scan_ctx_t *ctx = ptr;
+	/* Same upper bound that component matching applies to fetched files */
+	if (iteration >= fetch_max_files)
+	{
+		scanlog("purl_scan: max file iterations reached: %d\n", fetch_max_files);
+		return true;
+	}
+
+	/* Ignore truncated records and those whose url id is the empty string md5 */
+	if (datalen < MD5_LEN || !memcmp(raw_data, empty_string_md5, MD5_LEN))
+		return false;
+	/* Query with a private copy of the id rather than the record buffer itself */
+	uint8_t url_id[MD5_LEN];
+	memcpy(url_id, raw_data, MD5_LEN);
+	ldb_fetch_recordset(NULL, oss_url, url_id, false, handle_url_for_purls, ctx);
+	ctx->files_processed++;
+	return false;
+}
+
+/* Print, as JSON on stdout, the purls and url hashes linked to a file MD5 (contract documented in inc/purl_scan.h). */
+int purl_scan(char *file_md5_hex)
+{
+	if (!file_md5_hex || !valid_md5(file_md5_hex))
+	{
+		fprintf(stdout, "Invalid file MD5: %s\n", file_md5_hex ? file_md5_hex : "(null)");
+		return EXIT_FAILURE;
+	}
+	uint8_t file_md5[MD5_LEN];
+	ldb_hex_to_bin(file_md5_hex, MD5_LEN * 2, file_md5);
+	/* Walk the file table; every record triggers a url table lookup that fills ctx */
+	purl_scan_ctx_t ctx;
+	memset(&ctx, 0, sizeof(ctx));
+	ldb_fetch_recordset(NULL, oss_file, file_md5, false, handle_file_for_purls, &ctx);
+	scanlog("purl_scan: %d unique purls found across %u files for %s\n", ctx.count, ctx.files_processed, file_md5_hex);
+
+	/* Sort entries (rank asc, then purl) and url hashes for a deterministic output */
+	int rc = EXIT_SUCCESS;
+	purl_entry_t **sorted = ctx.count ? malloc(ctx.count * sizeof(purl_entry_t *)) : NULL;
+	if (ctx.count && !sorted)
+	{
+		fprintf(stderr, "purl_scan: out of memory while sorting %d purls\n", ctx.count);
+		rc = EXIT_FAILURE; /* nothing is printed, but the cleanup below still runs */
+	}
+	int n_sorted = 0;
+	for (purl_entry_t *e = ctx.head; sorted && e; e = e->next)
+	{
+		if (e->n_url_hashes > 1)
+			qsort(e->url_hashes, e->n_url_hashes, sizeof(char *), url_hash_cmp);
+		sorted[n_sorted++] = e;
+	}
+	if (n_sorted)
+		qsort(sorted, n_sorted, sizeof(purl_entry_t *), purl_entry_ptr_cmp);
+
+	if (!quiet && rc == EXIT_SUCCESS)
+	{
+		printf("{\"file_md5\": \"%s\", \"matches\": [", file_md5_hex);
+		for (int i = 0; i < n_sorted; i++)
+		{
+			purl_entry_t *e = sorted[i];
+			if (i)
+				printf(", ");
+			printf("{\"purl\": \"%s\", \"url_hashes\": [", e->purl);
+			for (int v = 0; v < e->n_url_hashes; v++)
+			{
+				if (v)
+					printf(", ");
+				printf("\"%s\"", e->url_hashes[v]);
+			}
+			printf("], \"rank\": %d}", e->rank);
+		}
+		printf("]}\n");
+		fflush(stdout);
+	}
+
+	/* Cleanup */
+	free(sorted);
+	purl_entry_t *e = ctx.head;
+	while (e)
+	{
+		purl_entry_t *next = e->next;
+		for (int v = 0; v < e->n_url_hashes; v++)
+			free(e->url_hashes[v]);
+		free(e->url_hashes);
+		free(e->purl);
+		free(e);
+		e = next;
+	}
+	return rc;
+}
+
+/**
+ * @brief Resolve and report a single component identified by its url hash.
+ * Emits one JSON object of the form {"url_hash": "...", "component": {...}}; "component" is null when no component was selected.
+ */
+static void component_scan_one(const char *url_hash_hex)
+{
+ uint8_t url_hash[MD5_LEN];
+ ldb_hex_to_bin((char *) url_hash_hex, MD5_LEN * 2, url_hash);
+
+ scanlog("component_scan_one: looking up url_hash=%s (declared_components=%p)\n",
+ url_hash_hex, (void *) declared_components);
+ /* component stays NULL unless get_oldest_url stores a selection through &component */
+ component_data_t *component = NULL;
+ int records = ldb_fetch_recordset(NULL, oss_url, url_hash, false, get_oldest_url, &component);
+ scanlog("component_scan_one: url_hash=%s -> %d records, selected purl=%s identified=%d\n",
+ url_hash_hex, records,
+ (component && component->purls[0]) ? component->purls[0] : "(none)",
+ component ? component->identified : -99);
+
+ printf("{\"url_hash\": \"%s\", \"component\": ", url_hash_hex);
+ if (!component)
+ {
+ printf("null");
+ }
+ else
+ {
+ /* Fill missing purl md5s (lazy step also done by print_json_component) */
+ for (int i = 0; i < MAX_PURLS; i++)
+ {
+ if (component->purls[i] && !component->purls_md5[i])
+ {
+ component->purls_md5[i] = malloc(MD5_LEN);
+ MD5((uint8_t *)component->purls[i], strlen(component->purls[i]), component->purls_md5[i]);
+ }
+ }
+
+ /* print_licenses (and other enrichers) look up comp->file_md5_ref
+ unconditionally. We don't have a matched file in this mode, so
+ point it at the url_md5 to avoid a NULL deref; the lookup will
+ simply return no extra records. */
+ if (!component->file_md5_ref)
+ component->file_md5_ref = component->url_md5;
+ /* NOTE(review): file_md5_ref now aliases url_md5 - confirm component_data_free() does not release both */
+ fetch_related_purls(component);
+ fill_main_url(component);
+
+ printf("{");
+ print_purl_array(component);
+
+ printf("\"vendor\": \"%s\",", component->vendor ? component->vendor : "");
+ printf("\"component\": \"%s\",", component->component ? component->component : "");
+ /* NOTE(review): string_clean() results are printed but never freed here - presumably cleaned in place; verify */
+ char *version_clean = string_clean(component->version);
+ printf("\"version\": \"%s\",", version_clean ? version_clean : "");
+
+ char *latest_clean = string_clean(component->latest_version);
+ printf("\"latest\": \"%s\",", latest_clean ? latest_clean : "");
+
+ printf("\"url\": \"%s\",", component->main_url ? component->main_url : (component->url ? component->url : ""));
+ printf("\"release_date\": \"%s\",", component->release_date ? component->release_date : "");
+
+ /* The lookup is by url hash, so report the url basename as the file */
+ char *file_field = NULL;
+ if (component->url)
+ {
+ char *url_copy = strdup(component->url);
+ file_field = strdup(basename(url_copy));
+ free(url_copy);
+ }
+ printf("\"file\": \"%s\",", file_field ? file_field : "");
+ free(file_field);
+
+ printf("\"rank\": %d", component->rank);
+ /* Optional sections follow; each one prints its own leading comma */
+ if (!(engine_flags & DISABLE_LICENSES))
+ {
+ print_licenses(component);
+ if (component->license_text)
+ printf(",%s", json_remove_invalid_char(component->license_text));
+ }
+
+ if (!(engine_flags & DISABLE_HEALTH))
+ {
+ if (!component->health_text)
+ print_health(component);
+ if (component->health_text)
+ printf(",%s", json_remove_invalid_char(component->health_text));
+ /* url_stats is emitted together with health and stays {} unless url_stats[0] > 0 */
+ printf(",\"url_stats\":{");
+ if (component->url_stats[0] > 0)
+ {
+ printf("\"total_files\":%d,"
+ "\"indexed_files\":%d,"
+ "\"source_files\":%d,"
+ "\"ignored_files\":%d,"
+ "\"package_size\":%d",
+ component->url_stats[0], component->url_stats[1], component->url_stats[2],
+ component->url_stats[3], component->url_stats[4]);
+ }
+ printf("}");
+ }
+
+ if (!(engine_flags & DISABLE_DEPENDENCIES))
+ {
+ if (!component->dependency_text)
+ print_dependencies(component);
+ if (component->dependency_text)
+ printf(",%s", json_remove_invalid_char(component->dependency_text));
+ }
+ /* NOTE(review): copyright_text is printed without json_remove_invalid_char, unlike the other sections - confirm it is already JSON-safe */
+ if (!(engine_flags & DISABLE_COPYRIGHTS))
+ {
+ print_copyrights(component);
+ if (component->copyright_text)
+ printf(",%s", component->copyright_text);
+ }
+
+ if (!(engine_flags & DISABLE_VULNERABILITIES))
+ {
+ print_vulnerabilities(component);
+ if (component->vulnerabilities_text)
+ printf(",%s", json_remove_invalid_char(component->vulnerabilities_text));
+ }
+
+ printf("}");
+ }
+ printf("}");
+
+ if (component)
+ component_data_free(component);
+}
+
+/**
+ * @brief Resolve and report the details of one or more components identified
+ * by url hash (url_id). Accepts a single hash or a comma-separated list.
+ *
+ * Output is always an array under the "results" key, one entry per valid hash:
+ * {"results": [{"url_hash": "...", "component": {...}}, ...]}. Invalid hashes are
+ * skipped with a stderr warning; hashes beyond MAX_COMPONENT_SCAN_HASHES are dropped.
+ *
+ * @param url_hash_list comma-separated url hashes in hex (32 chars each)
+ * @return EXIT_SUCCESS if at least one valid hash was processed,
+ * EXIT_FAILURE if input is null/empty or no hash was valid
+ */
+int component_scan(char *url_hash_list)
+{
+	if (!url_hash_list || !*url_hash_list)
+	{
+		fprintf(stdout, "Invalid url hash list: %s\n", url_hash_list ? url_hash_list : "(null)");
+		return EXIT_FAILURE;
+	}
+	char *input = strdup(url_hash_list); /* strtok_r writes into the buffer it splits */
+	if (!input) { fprintf(stderr, "component_scan: out of memory\n"); return EXIT_FAILURE; }
+	int emitted = 0;
+
+	if (!quiet)
+		printf("{\"results\": [");
+
+	char *saveptr = NULL;
+	for (char *tok = strtok_r(input, ",", &saveptr); tok; tok = strtok_r(NULL, ",", &saveptr))
+	{
+		while (*tok == ' ' || *tok == '\t') tok++;
+		char *end = tok + strlen(tok);
+		while (end > tok && (end[-1] == ' ' || end[-1] == '\t' || end[-1] == '\n' || end[-1] == '\r'))
+			*--end = '\0';
+		if (!*tok)
+			continue;
+		if (!valid_md5(tok))
+		{
+			fprintf(stderr, "Invalid url hash MD5, skipping: %s\n", tok);
+			continue;
+		}
+		if (emitted >= MAX_COMPONENT_SCAN_HASHES)
+		{
+			fprintf(stderr, "Too many url hashes, only the first %d are processed\n", MAX_COMPONENT_SCAN_HASHES);
+			break;
+		}
+		if (!quiet)
+		{
+			if (emitted)
+				printf(", ");
+			component_scan_one(tok);
+		}
+		emitted++;
+	}
+	if (!quiet)
+	{
+		printf("]}\n");
+		fflush(stdout);
+	}
+	free(input);
+	return emitted > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/**
+ * @brief Snippet-only scan with WFP coming from an arbitrary FILE*. Emits
+ * JSON listing the candidate file_md5s grouped by snippet region, with
+ * input/oss line ranges, filtered by the cohort tolerance set via -T.
+ * @param in stream holding a single-file WFP block; reading stops at a second file= entry
+ * @return EXIT_SUCCESS on success, EXIT_FAILURE on malformed or empty input
+ */
+static int snippet_scan_stream(FILE *in)
+{
+	char *line = NULL;
+	size_t len = 0;
+	ssize_t lineln;
+
+	scan_data_t *scan = NULL;
+	char file_md5_hex[MD5_LEN * 2 + 1] = "";
+	char file_path[MAX_FILE_PATH] = "";
+	uint64_t file_size = 0;
+	bool got_file = false;
+
+	scanlog("--- SNIPPET SCAN ---\n");
+
+	while ((lineln = getline(&line, &len, in)) != -1)
+	{
+		trim(line);
+		if (!*line)
+			continue;
+		/* strncmp stops at the terminator, so a short line is never read past its end */
+		bool is_file = (strncmp(line, "file=", 5) == 0);
+		bool is_fh2 = (strncmp(line, "fh2=", 4) == 0);
+		bool is_hpsm = (strncmp(line, "hpsm=", 5) == 0);
+		bool is_bin = (strncmp(line, "bin=", 4) == 0);
+		bool is_wfp = (!is_file && !is_fh2 && !is_hpsm && !is_bin);
+
+		/* Snippet-only mode: ignore hpsm/bin payloads */
+		if (is_hpsm || is_bin)
+			continue;
+
+		if (is_file)
+		{
+			if (got_file)
+			{
+				/* Stop reading: the lines that follow belong to the extra file, not to ours */
+				fprintf(stderr, "snippet-scan: multiple file= entries received, ignoring extras\n");
+				break;
+			}
+			const int tagln = 5;
+			if (strlen(line) < (size_t)(tagln + MD5_LEN * 2 + 1))
+			{
+				fprintf(stderr, "snippet-scan: malformed file= line\n");
+				free(line);
+				return EXIT_FAILURE;
+			}
+
+			char *hexmd5 = strndup(line + tagln, MD5_LEN * 2);
+			if (!hexmd5 || strlen(hexmd5) < MD5_LEN * 2 || !valid_md5(hexmd5))
+			{
+				fprintf(stderr, "snippet-scan: invalid md5 in file= line\n");
+				free(hexmd5);
+				free(line);
+				return EXIT_FAILURE;
+			}
+			strcpy(file_md5_hex, hexmd5);
+			free(hexmd5);
+
+			uint8_t *rec = (uint8_t *) strdup(line + tagln + MD5_LEN * 2 + 1);
+			char *target_path = field_n(2, (char *) rec);
+			if (!target_path)
+			{
+				fprintf(stderr, "snippet-scan: malformed file= line (missing path)\n");
+				free(rec);
+				free(line);
+				return EXIT_FAILURE;
+			}
+
+			strncpy(file_path, target_path, sizeof(file_path) - 1);
+			file_path[sizeof(file_path) - 1] = '\0';
+
+			char size_field[MAX_FIELD_LN] = "0";
+			extract_csv(size_field, (char *) rec, 1, sizeof(size_field));
+			file_size = strtoull(size_field, NULL, 10);
+
+			scan = scan_data_init(file_path,
+				scan_max_snippets,
+				scan_max_components,
+				scan_adjust_tolerance,
+				scan_ranking_threshold,
+				scan_min_match_hits,
+				scan_min_match_lines,
+				scan_range_tolerance,
+				scan_honor_file_extension);
+			scan->preload = true;
+			/* The input WFP has no reliable extension, and oss_file lookup
+			   inside snippet_extension_discard would discard valid hits. */
+			scan->snippet_honor_file_extension = false;
+			/* scan->file_size is a fixed 32-byte buffer; write the parsed
+			   numeric value, which always fits, instead of the raw field */
+			snprintf(scan->file_size, 32, "%llu", (unsigned long long) file_size);
+			ldb_hex_to_bin(file_md5_hex, MD5_LEN * 2, scan->md5);
+			strcpy(scan->source_md5, file_md5_hex);
+			free(rec);
+			got_file = true;
+			continue;
+		}
+
+		if (is_fh2 && scan && strlen(line) == MD5_LEN_HEX + 4)
+		{
+			ldb_hex_to_bin(&line[4], MD5_LEN_HEX, scan->md5_fh2);
+			scan->windows_line_endings = true;
+			continue;
+		}
+
+		if (is_wfp && scan && (scan->hash_count < MAX_HASHES_READ))
+		{
+			int line_ln = strlen(line);
+			for (int e = 0; e < line_ln; e++)
+				if (line[e] == '=' || line[e] == ',') line[e] = 0;
+			/* Tokens live inside [line, line_end); bytes past it are leftovers of earlier reads */
+			char *line_end = line + line_ln;
+			int line_nr = atoi(line);
+			char *hexhash = line + strlen(line) + 1;
+			while (hexhash < line_end && *hexhash)
+			{
+				ldb_hex_to_bin(hexhash, 8, (uint8_t *) &scan->hashes[scan->hash_count]);
+				uint32_reverse((uint8_t *) &scan->hashes[scan->hash_count]);
+				scan->lines[scan->hash_count] = line_nr;
+				hexhash += strlen(hexhash) + 1;
+				scan->hash_count++;
+				if (scan->hash_count >= MAX_HASHES_READ)
+					break;
+			}
+		}
+	}
+	free(line);
+
+	if (!scan)
+	{
+		fprintf(stderr, "snippet-scan: no file= entry in WFP input\n");
+		return EXIT_FAILURE;
+	}
+
+	if (scan->hash_count == 0)
+	{
+		fprintf(stderr, "snippet-scan: no WFP hashes in WFP input\n");
+		scan_data_free(scan);
+		return EXIT_FAILURE;
+	}
+
+	scan->total_lines = scan->lines[scan->hash_count - 1];
+	scan->timer = microseconds_now();
+	scan->match_type = ldb_scan_snippets(scan);
+	if (scan->match_type != MATCH_NONE)
+		biggest_snippet(scan);
+
+	if (!quiet)
+	{
+		char *escaped_path = scape_slashes(file_path);
+		printf("{");
+		printf("\"file_md5\":\"%s\",", file_md5_hex);
+		printf("\"file_path\":\"%s\",", escaped_path ? escaped_path : "");
+		printf("\"file_size\":%llu,", (unsigned long long) file_size);
+		printf("\"total_lines\":%d,", scan->total_lines);
+		printf("\"tolerance_pct\":%.1f,", match_list_tolerance_get());
+		printf("\"snippet_groups\":[");
+
+		bool first_group = true;
+		for (int g = 0; g < scan->matches_list_array_index; g++)
+		{
+			match_list_t *list = scan->matches_list_array[g];
+			if (!list || !list->items)
+				continue;
+
+			if (!first_group)
+				printf(",");
+			first_group = false;
+
+			printf("{\"group_index\":%d,\"candidates\":[", g);
+			bool first_cand = true;
+			for (struct entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next)
+			{
+				match_data_t *m = np->match;
+				if (!m)
+					continue;
+
+				char md5_hex[MD5_LEN * 2 + 1];
+				ldb_bin_to_hex(m->file_md5, MD5_LEN, md5_hex);
+
+				if (!first_cand)
+					printf(",");
+				first_cand = false;
+
+				printf("{\"file_md5\":\"%s\",", md5_hex);
+				printf("\"hits\":%d,", m->hits);
+				printf("\"lines_matched\":%d,", m->lines_matched);
+				printf("\"matched_percent\":%d,", m->matched_percent);
+				printf("\"input_line_ranges\":\"%s\",", m->line_ranges ? m->line_ranges : "");
+				printf("\"oss_line_ranges\":\"%s\"", m->oss_ranges ? m->oss_ranges : "");
+				printf("}");
+			}
+			printf("]}");
+		}
+		printf("]}\n");
+		fflush(stdout);
+		free(escaped_path);
+	}
+
+	scan_data_free(scan);
+	return EXIT_SUCCESS;
+}
+/* Snippet-only scan reading the WFP block from stdin (main dispatches "-S -" here); thin wrapper over snippet_scan_stream */
+int snippet_scan_stdin(void)
+{
+ return snippet_scan_stream(stdin);
+}
+
+/* Snippet-only scan over an in-memory WFP block (contract documented in inc/purl_scan.h) */
+int snippet_scan_string(const char *wfp)
+{
+	if (wfp == NULL || *wfp == '\0')
+	{
+		fprintf(stderr, "snippet-scan: empty WFP argument\n");
+		return EXIT_FAILURE;
+	}
+	/* Expose the argument as a read-only stream so the FILE* based parser can be reused */
+	FILE *stream = fmemopen((void *) wfp, strlen(wfp), "r");
+	if (stream == NULL)
+	{
+		fprintf(stderr, "snippet-scan: failed to open WFP argument as stream\n");
+		return EXIT_FAILURE;
+	}
+	const int status = snippet_scan_stream(stream);
+	fclose(stream);
+	return status;
+}
diff --git a/src/url.c b/src/url.c
index cbb21c1..eee229f 100644
--- a/src/url.c
+++ b/src/url.c
@@ -335,10 +335,10 @@ void purl_release_date(char *purl, char *date)
bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr)
{
char * url = decrypt_data(data, datalen, oss_url, key, subkey);
- if (!url)
+ if (!url)
return false;
- //scanlog("url: %s\n", url);
+ scanlog("get_oldest_url iter=%d url_record='%s'\n", iteration, url);
/* Get oldest */
component_data_t **comp_address = ptr;
@@ -357,6 +357,9 @@ bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data,
}
comp->identified = IDENTIFIED_NONE;
asset_declared(comp);
+ scanlog("get_oldest_url iter=%d purl[0]=%s identified=%d rank=%d release=%s\n",
+ iteration, comp->purls[0] ? comp->purls[0] : "(null)", comp->identified, comp->rank,
+ comp->release_date ? comp->release_date : "(null)");
purl_latest_version_add(comp);
if (component_rank_max > 0 && comp->rank > component_rank_max)