diff --git a/engine/src/main/java/com/github/jinba1/cuckoodb/ColumnMeta.java b/engine/src/main/java/com/github/jinba1/cuckoodb/ColumnMeta.java new file mode 100644 index 0000000..815af91 --- /dev/null +++ b/engine/src/main/java/com/github/jinba1/cuckoodb/ColumnMeta.java @@ -0,0 +1,22 @@ +package com.github.jinba1.cuckoodb; + +/** + * Describes one output column of a {@link QueryResultSet}, in result-column order. + * + *

Result rows are strictly positional and column names are not unique — a + * join {@code SELECT *} can emit the same bare name twice (e.g. two {@code a} columns). + * {@code qualifiedName} disambiguates those: it is the dotted schema origin + * (e.g. {@code student.a} vs {@code enrolled.a}) when one exists, and {@code null} for + * aggregate/computed columns and for single-table scans whose schema carries no table + * prefix. Clients must address columns by position, not by name. + * + *

{@code type} is best-effort, inferred from the first row's runtime value, so it is + * {@code null} for an empty result (no row to infer from). The authoritative typed schema + * for a base table comes from the catalog, not this field. + * + * @param name the bare header name (table prefix stripped; aggregate keys kept whole) + * @param qualifiedName the dotted schema origin, or null when there is none + * @param type the inferred column type, or null when the result is empty + */ +public record ColumnMeta(String name, String qualifiedName, ColumnType type) { +} diff --git a/engine/src/main/java/com/github/jinba1/cuckoodb/CuckooDB.java b/engine/src/main/java/com/github/jinba1/cuckoodb/CuckooDB.java index 511a421..4b0d28d 100644 --- a/engine/src/main/java/com/github/jinba1/cuckoodb/CuckooDB.java +++ b/engine/src/main/java/com/github/jinba1/cuckoodb/CuckooDB.java @@ -4,7 +4,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVPrinter; @@ -125,15 +127,13 @@ public static QueryResult execute(Operator root, String outputFile) { try { try (CSVPrinter printer = new CSVPrinter(new FileWriter(outputFile), format)) { printer.printRecord(headers); - Tuple tuple; - while ((tuple = root.getNextTuple()) != null) { + rows = drain(root, tuple -> { List fields = new ArrayList<>(tuple.getTuple().size()); for (Value v : tuple.getTuple()) { fields.add(v.toString()); } printer.printRecord(fields); - rows++; - } + }); } } catch (RuntimeException e) { // QueryExecutionException and internal errors alike: never leave a @@ -148,8 +148,7 @@ public static QueryResult execute(Operator root, String outputFile) { "Failed to write output file '" + outputFile + "': " + e.getMessage()); } - // The planner places Limit topmost, so the root knows whether the cap cut the result - boolean truncated = root instanceof LimitOperator limitOp && limitOp.wasTruncated(); + boolean truncated = wasTruncated(root); QueryResult result = truncated ? QueryResult.truncated(rows) : QueryResult.complete(rows); System.out.println("Query executed successfully!"); @@ -158,6 +157,118 @@ public static QueryResult execute(Operator root, String outputFile) { return result; } + /** + * Drains the plan into an in-memory result set — column metadata, positional rows, and the + * same truncation/hint signal {@link #execute} reports, but with no stdout and no file write. + * This is the library/REST entry point; the CLI keeps using {@link #execute}. Both share + * {@link #drain}, so they cannot diverge in iteration, row count, or truncation semantics. + * @param root the root operator of an executable plan (never null; EXPLAIN is handled before here) + * @return the fully-materialized result + */ + public static QueryResultSet executeToResultSet(Operator root) { + String schemaId = root.propagateSchemaId(); + List names = root.getContext().getOrderedColumnNames(schemaId); + List> rows = new ArrayList<>(); + try { + drain(root, tuple -> rows.add(List.copyOf(tuple.getTuple()))); + } catch (IOException e) { + // The in-memory sink does no I/O; drain only declares IOException for the CSV path. + throw new QueryExecutionException(ErrorCode.INTERNAL, + "Unexpected I/O while draining query result: " + e.getMessage()); + } + boolean truncated = wasTruncated(root); + String hint = truncated ? QueryResult.truncated(rows.size()).hint() : null; + List columns = buildColumns(root.getContext(), schemaId, names, rows); + // The result is a value handed to library/REST callers; the outer lists are + // unmodifiable so a consumer cannot mutate the result after the fact (each inner + // row is already immutable via List.copyOf in the drain sink). + return new QueryResultSet(Collections.unmodifiableList(columns), + Collections.unmodifiableList(rows), truncated, hint); + } + + /** + * Pulls every tuple from the plan to EOF, handing each to {@code sink}, and returns the row + * count. The single drain point for both result paths: truncation can only be read once this + * has run to a null tuple, so sharing it keeps the file and in-memory paths consistent. + */ + private static long drain(Operator root, TupleSink sink) throws IOException { + long rows = 0; + Tuple tuple; + while ((tuple = root.getNextTuple()) != null) { + sink.accept(tuple); + rows++; + } + return rows; + } + + /** A per-tuple action that may fail with an I/O error (the CSV writer's printRecord does). */ + @FunctionalInterface + private interface TupleSink { + void accept(Tuple tuple) throws IOException; + } + + /** + * Whether a LIMIT cut the result short. The planner places LIMIT topmost, so only the root + * can report this, and only after the drain reached EOF (the peek past the cap has happened). + */ + private static boolean wasTruncated(Operator root) { + return root instanceof LimitOperator limitOp && limitOp.wasTruncated(); + } + + /** + * Builds per-position column metadata for a result set. {@code name} is the bare header + * (table prefix stripped, aggregate keys kept whole); {@code qualifiedName} is the dotted, + * non-aggregate schema key for that index when one exists (null for single-table scans and + * computed columns); {@code type} is inferred from the first row, or null for an empty result. + */ + private static List buildColumns(PlanContext ctx, String schemaId, + List names, List> rows) { + int width = names.size(); + String[] qualified = new String[width]; + Map schema = ctx.getSchema(schemaId); + if (schema != null) { + // Mirror getOrderedColumnNames' deterministic rule — sorted keys, first non-null + // wins per index — so `name` and `qualifiedName` are chosen from a single key + // choice and never disagree run-to-run. Skip internal intermediate-schema keys + // (temp_.col) outright: a join over a pushed-down source registers BOTH a + // base-qualified key (Enrolled.a) and a temp_ key at the same index, and the + // temp_ id is not a real origin (and is plan/JVM-unstable). + List keys = new ArrayList<>(schema.keySet()); + Collections.sort(keys); + for (String key : keys) { + int idx = schema.get(key); + if (idx < 0 || idx >= width || qualified[idx] != null + || key.startsWith(Constants.INTERMEDIATE_SCHEMA_PREFIX)) { + continue; + } + // A dotted, non-aggregate key (e.g. "Student.a") is the column's qualified + // origin. Aggregate keys like "sum(student.c)" also contain '.', so the '(' + // check excludes them. Lowercased to match the bare name (-> "student.a"). + if (key.indexOf('.') >= 0 && key.indexOf('(') < 0) { + qualified[idx] = key.toLowerCase(); + } + } + } + List firstRow = rows.isEmpty() ? null : rows.get(0); + List columns = new ArrayList<>(width); + for (int i = 0; i < width; i++) { + ColumnType type = firstRow == null ? null : inferType(firstRow.get(i)); + columns.add(new ColumnMeta(names.get(i), qualified[i], type)); + } + return columns; + } + + /** Maps a runtime value to its column type. Exhaustive over the sealed {@link Value}. */ + private static ColumnType inferType(Value value) { + if (value instanceof IntValue) { + return ColumnType.INT; + } else if (value instanceof StringValue) { + return ColumnType.STRING; + } + throw new QueryExecutionException(ErrorCode.INTERNAL, + "Unknown value type: " + value.getClass().getName()); + } + /** Never leave a truncated file that looks like a complete result. */ private static void deletePartialOutput(File outputFileObj, String outputFile) { if (outputFileObj.isFile() && !outputFileObj.delete()) { diff --git a/engine/src/main/java/com/github/jinba1/cuckoodb/DBCatalog.java b/engine/src/main/java/com/github/jinba1/cuckoodb/DBCatalog.java index 0360d24..9ac5549 100644 --- a/engine/src/main/java/com/github/jinba1/cuckoodb/DBCatalog.java +++ b/engine/src/main/java/com/github/jinba1/cuckoodb/DBCatalog.java @@ -10,6 +10,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Stream; /** @@ -29,17 +30,19 @@ public class DBCatalog { private static DBCatalog instance; - private final Map dbLocations; - private final Map> dbSchemata; - private final Map> dbColumnTypes; + /** + * One entry per table, each a fully-populated {@link TableMeta} (location + schema + + * types). A single map means a table is published with one atomic write and read with + * one atomic lookup — no torn state across what used to be three parallel maps. This is + * what makes runtime {@link #registerTable} safe against concurrent readers. + */ + private final Map tables; /** * Private constructor to ensure singleton design. */ private DBCatalog() { - dbLocations = new java.util.concurrent.ConcurrentHashMap<>(); - dbSchemata = new java.util.concurrent.ConcurrentHashMap<>(); - dbColumnTypes = new java.util.concurrent.ConcurrentHashMap<>(); + tables = new ConcurrentHashMap<>(); } /** @@ -94,7 +97,7 @@ private void loadDBCatalog(String dBDirectory) { for (Path csv : csvs) { String fileName = csv.getFileName().toString(); String tableName = fileName.substring(0, fileName.length() - 4); - loadTable(tableName, csv); + tables.put(tableName, parseTable(tableName, csv)); } } catch (IOException e) { throw new QueryExecutionException(ErrorCode.DATA_ERROR, @@ -102,7 +105,40 @@ private void loadDBCatalog(String dBDirectory) { } } - private void loadTable(String tableName, Path csv) throws IOException { + /** + * Registers a single table at runtime from a CSV file, for callers (the REST gateway) + * that add tables after startup. Parses and infers the schema with the same rules as + * directory load, then publishes the table with one atomic {@code putIfAbsent}. + *

+ * Returns {@code false} (and leaves the existing table untouched) when a table of that + * name already exists — the caller's 409 signal. Never {@code containsKey}-then-{@code put}, + * which would reintroduce the race this design removes. + * @param tableName the catalog name to publish under + * @param csv the CSV file backing the table; must outlive every query that scans it + * @return {@code true} if this call registered the table; {@code false} if the name was taken + * @throws QueryExecutionException with {@link ErrorCode#DATA_ERROR} if the CSV is malformed + */ + public boolean registerTable(String tableName, Path csv) { + TableMeta meta; + try { + meta = parseTable(tableName, csv); + } catch (IOException e) { + throw new QueryExecutionException(ErrorCode.DATA_ERROR, + "Could not read table '" + tableName + "' from " + csv + ": " + e.getMessage()); + } + return tables.putIfAbsent(tableName, meta) == null; + } + + /** + * Parses one CSV into table metadata: column names from the header row, types inferred + * from the data rows. Pure (no map writes) so both directory load and {@link #registerTable} + * share identical parse-and-infer rules. The returned schema and types are unmodifiable. + *

Type inference starts every column as INT and demotes to STRING on the first + * non-integer field, so a header-only CSV (no data rows) infers every column as + * INT — there is no evidence to refute INT. REST callers uploading a schema-only file get + * an all-INT table until rows arrive. + */ + private static TableMeta parseTable(String tableName, Path csv) throws IOException { CSVFormat format = CSVFormat.RFC4180.builder() .setIgnoreSurroundingSpaces(true) .build(); @@ -146,9 +182,9 @@ private void loadTable(String tableName, Path csv) throws IOException { types.add(isInt[i] ? ColumnType.INT : ColumnType.STRING); } - dbSchemata.put(tableName, Collections.unmodifiableMap(columnMap)); - dbColumnTypes.put(tableName, Collections.unmodifiableList(types)); - dbLocations.put(tableName, csv); + return new TableMeta(csv, + Collections.unmodifiableMap(columnMap), + Collections.unmodifiableList(types)); } } @@ -162,13 +198,25 @@ private static boolean parsesAsInt(String field) { } } + /** + * Returns the full metadata for a table in one atomic lookup, or null if absent. + * Callers that need more than one of location/schema/types (e.g. ScanOperator) use + * this so the fields they read are guaranteed to come from the same registration. + * @param tableName The name of the table + * @return The table's metadata, or null if the table is not in the catalog + */ + public TableMeta getTableMeta(String tableName) { + return tables.get(tableName); + } + /** * Returns the file path for a specified table. * @param tableName The name of the table * @return The Path object representing the table's data file location */ public Path getDBLocation(String tableName) { - return dbLocations.get(tableName); + TableMeta meta = tables.get(tableName); + return meta == null ? null : meta.path(); } /** @@ -178,7 +226,8 @@ public Path getDBLocation(String tableName) { * @return A map from column names to their positions (indices) */ public Map getDBSchemata(String tableName) { - return dbSchemata.get(tableName); + TableMeta meta = tables.get(tableName); + return meta == null ? null : meta.schema(); } /** @@ -187,7 +236,8 @@ public Map getDBSchemata(String tableName) { * @return A list of ColumnType values in column order, or null if table not found */ public List getColumnTypes(String tableName) { - return dbColumnTypes.get(tableName); + TableMeta meta = tables.get(tableName); + return meta == null ? null : meta.types(); } /** @@ -196,7 +246,7 @@ public List getColumnTypes(String tableName) { * @return The sorted list of loaded table names */ public List getTableNames() { - List names = new ArrayList<>(dbLocations.keySet()); + List names = new ArrayList<>(tables.keySet()); Collections.sort(names); return names; } @@ -220,7 +270,7 @@ public QueryExecutionException unknownTable(String tableName) { * @return true if the table exists, false otherwise */ public boolean tableExists(String tableName) { - return (dbLocations.containsKey(tableName) && dbSchemata.containsKey(tableName)); + return tables.containsKey(tableName); } /** @@ -230,9 +280,10 @@ public boolean tableExists(String tableName) { * @return true if the column exists in the table, false otherwise */ public boolean columnExists(String tableName, String columnName) { - if (!tableExists(tableName)) { + TableMeta meta = tables.get(tableName); + if (meta == null) { return false; } - return dbSchemata.get(tableName).containsKey(columnName.toLowerCase()); + return meta.schema().containsKey(columnName.toLowerCase()); } } diff --git a/engine/src/main/java/com/github/jinba1/cuckoodb/QueryPlanner.java b/engine/src/main/java/com/github/jinba1/cuckoodb/QueryPlanner.java index fdf51f8..ad1ce3d 100644 --- a/engine/src/main/java/com/github/jinba1/cuckoodb/QueryPlanner.java +++ b/engine/src/main/java/com/github/jinba1/cuckoodb/QueryPlanner.java @@ -79,8 +79,6 @@ public static PlannedQuery planQuery(String filename) { * with an {@link ErrorCode} and a message the caller can act on */ public static PlannedQuery planQuery(String filename, QueryConfig config) { - PlanContext ctx = new PlanContext(config); - Statement statement; try { statement = CCJSqlParserUtil.parse(new FileReader(filename)); @@ -91,6 +89,53 @@ public static PlannedQuery planQuery(String filename, QueryConfig config) { throw new QueryExecutionException(ErrorCode.DATA_ERROR, "Could not read query file '" + filename + "': " + e.getMessage()); } + return planFrom(statement, config); + } + + /** + * Plans one query from SQL held in memory — the library/REST entry point. Parses the + * text, then runs the identical statement-level pipeline as the file overload, so a + * query planned from a string and the same query planned from a file are byte-for-byte + * the same plan (and EXPLAIN renders identically). + * + *

Named {@code planSql} rather than overloading {@code planQuery(String, QueryConfig)} + * because that signature already exists for the file path; both take a {@code String}, so + * a distinct name is the only unambiguous way to mean "this argument is SQL text, not a + * path". Read-only-by-construction holds across the parser: a non-SELECT statement is + * rejected as {@link ErrorCode#UNSUPPORTED_SQL} and multi-statement input fails to parse + * ({@link ErrorCode#PARSE_ERROR}). Unlike the file overload there is no I/O, so no + * {@code DATA_ERROR} branch. + * @param sql the SQL query text (optionally EXPLAIN-prefixed); not a file path + * @param config the per-query planner configuration + * @return the planned query; the root is never null + * @throws QueryExecutionException with an {@link ErrorCode} the caller can act on + */ + public static PlannedQuery planSql(String sql, QueryConfig config) { + if (sql == null || sql.isBlank()) { + // Keep the "every failure is a classified QueryExecutionException" invariant: + // CCJSqlParserUtil.parse(null) throws a bare NPE that would otherwise escape as + // an unclassified RuntimeException (the server would map it to 500, not 400). + throw new QueryExecutionException(ErrorCode.PARSE_ERROR, + "SQL text must not be null or blank"); + } + Statement statement; + try { + statement = CCJSqlParserUtil.parse(sql); + } catch (JSQLParserException e) { + throw new QueryExecutionException(ErrorCode.PARSE_ERROR, + "SQL syntax error: " + parserMessage(e)); + } + return planFrom(statement, config); + } + + /** + * The shared planning core: takes an already-parsed statement and produces the optimized, + * executable plan (plus EXPLAIN text when the statement is EXPLAIN-prefixed). Both source + * overloads delegate here so source format can never change the resulting plan. EXPLAIN + * detection and optimization happen at this statement level, identically for every source. + */ + private static PlannedQuery planFrom(Statement statement, QueryConfig config) { + PlanContext ctx = new PlanContext(config); boolean explain = false; Select select; diff --git a/engine/src/main/java/com/github/jinba1/cuckoodb/QueryResultSet.java b/engine/src/main/java/com/github/jinba1/cuckoodb/QueryResultSet.java new file mode 100644 index 0000000..7008442 --- /dev/null +++ b/engine/src/main/java/com/github/jinba1/cuckoodb/QueryResultSet.java @@ -0,0 +1,21 @@ +package com.github.jinba1.cuckoodb; + +import java.util.List; + +/** + * A fully-materialized query result held in memory, for callers (the REST gateway) that + * need the rows as data rather than written to a CSV file. The CLI path keeps writing to + * a file via {@link CuckooDB#execute}; both share one drain helper so they cannot diverge + * in iteration, row count, or truncation semantics. + * + *

Rows are positional and aligned with {@code columns} by index — see {@link ColumnMeta} + * for why callers must not rely on column names being unique. + * + * @param columns one entry per output column, in column order + * @param rows the result rows; each row's values align with {@code columns} by index + * @param truncated true when a LIMIT stopped the query although more rows existed + * @param hint how to refine the query when truncated; null otherwise + */ +public record QueryResultSet(List columns, List> rows, + boolean truncated, String hint) { +} diff --git a/engine/src/main/java/com/github/jinba1/cuckoodb/TableMeta.java b/engine/src/main/java/com/github/jinba1/cuckoodb/TableMeta.java new file mode 100644 index 0000000..79154bf --- /dev/null +++ b/engine/src/main/java/com/github/jinba1/cuckoodb/TableMeta.java @@ -0,0 +1,20 @@ +package com.github.jinba1.cuckoodb; + +import java.nio.file.Path; +import java.util.List; +import java.util.Map; + +/** + * Durable metadata for one base table, held as a single value in {@link DBCatalog}. + * Merging location, schema, and column types into one record lets the catalog publish + * a table with a single atomic map write — a table is either fully absent or fully + * present, with no torn read in any direction. This is the precondition for safe + * runtime table registration (REST upload) while queries read concurrently. + * + * @param path the CSV file backing the table; {@link com.github.jinba1.cuckoodb.operator.ScanOperator} + * re-opens it on every {@code reset()}, so it must outlive the query + * @param schema column name (lowercased) to position, unmodifiable + * @param types column types in column order, unmodifiable + */ +public record TableMeta(Path path, Map schema, List types) { +} diff --git a/engine/src/main/java/com/github/jinba1/cuckoodb/operator/ScanOperator.java b/engine/src/main/java/com/github/jinba1/cuckoodb/operator/ScanOperator.java index e727594..64cee45 100644 --- a/engine/src/main/java/com/github/jinba1/cuckoodb/operator/ScanOperator.java +++ b/engine/src/main/java/com/github/jinba1/cuckoodb/operator/ScanOperator.java @@ -7,6 +7,7 @@ import com.github.jinba1.cuckoodb.PlanContext; import com.github.jinba1.cuckoodb.QueryExecutionException; import com.github.jinba1.cuckoodb.StringValue; +import com.github.jinba1.cuckoodb.TableMeta; import com.github.jinba1.cuckoodb.Tuple; import com.github.jinba1.cuckoodb.Value; @@ -43,11 +44,14 @@ public class ScanOperator extends Operator { public ScanOperator(PlanContext ctx, String tableName) { super(ctx); this.tableName = tableName; - tablePath = DBCatalog.getInstance().getDBLocation(tableName); - this.columnTypes = DBCatalog.getInstance().getColumnTypes(tableName); - if (tablePath == null || columnTypes == null) { + // One atomic lookup: path and types come from the same table registration, + // so a concurrent runtime upload can never expose one without the other. + TableMeta meta = DBCatalog.getInstance().getTableMeta(tableName); + if (meta == null) { throw DBCatalog.getInstance().unknownTable(tableName); } + this.tablePath = meta.path(); + this.columnTypes = meta.types(); child = null; // Scan cannot have child operator this.schemaRegistered = true; diff --git a/engine/src/test/java/com/github/jinba1/cuckoodb/CuckooDBResultSetTest.java b/engine/src/test/java/com/github/jinba1/cuckoodb/CuckooDBResultSetTest.java new file mode 100644 index 0000000..2e9b6c5 --- /dev/null +++ b/engine/src/test/java/com/github/jinba1/cuckoodb/CuckooDBResultSetTest.java @@ -0,0 +1,200 @@ +package com.github.jinba1.cuckoodb; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import com.github.jinba1.cuckoodb.operator.Operator; + +/** + * E2/E3: in-memory result materialization ({@link CuckooDB#executeToResultSet}) with + * positional columns, qualified names for join-duplicate disambiguation, best-effort + * runtime type inference, and truncation/hint parity with the CSV-file path. + */ +class CuckooDBResultSetTest { + + private static final String SAMPLE_DB_DIR = "samples/db"; + + @TempDir + Path tempDir; + + @BeforeEach + void initCatalog() throws IOException { + DBCatalog.resetDBCatalog(); + DBCatalog.initDBCatalog(SAMPLE_DB_DIR); + // A table with one int column and one string column, registered at runtime, + // so type inference and the empty-result case are exercised on known types. + Path typed = tempDir.resolve("typed.csv"); + Files.writeString(typed, "n,s\n1,alice\n2,bob\n"); + DBCatalog.getInstance().registerTable("Typed", typed); + } + + @AfterEach + void resetCatalog() { + DBCatalog.resetDBCatalog(); + } + + private QueryResultSet run(String sql) { + Operator root = QueryPlanner.planSql(sql, QueryConfig.defaults()).root(); + return CuckooDB.executeToResultSet(root); + } + + @Test + void joinDuplicateNamesGetDistinctQualifiedNames() { + QueryResultSet rs = run("SELECT * FROM Student, Enrolled WHERE Student.A = Enrolled.A"); + + List names = rs.columns().stream().map(ColumnMeta::name).toList(); + assertEquals(List.of("a", "b", "c", "d", "a", "e", "h"), names, + "join SELECT * emits duplicate bare 'a' (verified header shape)"); + + assertEquals("student.a", rs.columns().get(0).qualifiedName()); + assertEquals("enrolled.a", rs.columns().get(4).qualifiedName()); + assertNotEquals(rs.columns().get(0).qualifiedName(), rs.columns().get(4).qualifiedName(), + "the two 'a' columns must be distinguishable by qualifiedName"); + } + + @Test + void typeInferredFromFirstRow() { + QueryResultSet rs = run("SELECT * FROM Typed"); + + assertEquals(2, rs.columns().size()); + assertEquals(ColumnType.INT, rs.columns().get(0).type(), "n column is all-int"); + assertEquals(ColumnType.STRING, rs.columns().get(1).type(), "s column is string"); + assertFalse(rs.truncated()); + assertNull(rs.hint()); + assertEquals(2, rs.rows().size()); + } + + @Test + void emptyResultKeepsNamesButHasNullTypes() { + QueryResultSet rs = run("SELECT * FROM Typed WHERE Typed.n > 999"); + + assertTrue(rs.rows().isEmpty(), "no rows match"); + List names = rs.columns().stream().map(ColumnMeta::name).toList(); + assertEquals(List.of("n", "s"), names, "column names survive an empty result"); + for (ColumnMeta col : rs.columns()) { + assertNull(col.type(), "no row means no inferable type"); + } + } + + @Test + void truncationAndHintMatchTheFilePath() throws IOException { + // Student has 6 rows; LIMIT 2 truncates. + QueryResultSet rs = run("SELECT * FROM Student LIMIT 2"); + assertTrue(rs.truncated()); + assertEquals(2, rs.rows().size()); + assertEquals(QueryResult.truncated(2).hint(), rs.hint(), + "hint text must reuse the CLI path's wording verbatim"); + } + + @Test + void limitAtOrAboveSizeIsNotTruncated() { + QueryResultSet rs = run("SELECT * FROM Student LIMIT 100"); + assertFalse(rs.truncated()); + assertNull(rs.hint()); + assertEquals(6, rs.rows().size()); + } + + @Test + void resultSetRowsMatchTheCsvFileBytesForSameQuery() throws IOException { + String sql = "SELECT Student.A, Student.D FROM Student WHERE Student.D > 30"; + + // File path + Path out = tempDir.resolve("out.csv"); + Operator fileRoot = QueryPlanner.planSql(sql, QueryConfig.defaults()).root(); + CuckooDB.execute(fileRoot, out.toString()); + List fileLines = Files.readAllLines(out); + List fileDataLines = fileLines.subList(1, fileLines.size()); // drop header + + // Result-set path + QueryResultSet rs = run(sql); + List rsDataLines = new ArrayList<>(); + for (List row : rs.rows()) { + List fields = new ArrayList<>(); + for (Value v : row) { + fields.add(v.toString()); + } + rsDataLines.add(String.join(",", fields)); + } + + assertEquals(fileDataLines, rsDataLines, + "the two drain paths must produce identical row data"); + } + + /** + * Regression for the review's HIGH finding: a one-sided WHERE filter makes the optimizer + * push a selection under the join, so the join source is an intermediate (temp_) schema + * that carries both a base-qualified key (enrolled.a) and an internal temp_.a key at + * the same index. qualifiedName must report the real origin, never the internal id. + */ + @Test + void qualifiedNameNeverLeaksInternalSchemaIdsAcrossPushdownJoin() { + QueryResultSet rs = run( + "SELECT * FROM Student, Enrolled WHERE Student.A = Enrolled.A AND Enrolled.H > 0"); + + List names = rs.columns().stream().map(ColumnMeta::name).toList(); + assertEquals(List.of("a", "b", "c", "d", "a", "e", "h"), names); + + List quals = rs.columns().stream().map(ColumnMeta::qualifiedName).toList(); + for (String q : quals) { + assertFalse(q != null && q.startsWith("temp_"), + "qualifiedName must be a real table origin, not an internal id: " + quals); + } + assertEquals("student.a", rs.columns().get(0).qualifiedName()); + assertEquals("enrolled.a", rs.columns().get(4).qualifiedName(), + "the pushed-down side still resolves to its base table origin"); + } + + @Test + void resultSetCollectionsAreUnmodifiable() { + // The record is a value handed to the REST layer; callers must not be able to mutate + // rows or columns after the fact. + QueryResultSet rs = run("SELECT * FROM Student"); + assertThrows(UnsupportedOperationException.class, () -> rs.rows().add(List.of())); + assertThrows(UnsupportedOperationException.class, + () -> rs.columns().add(new ColumnMeta("x", null, null))); + } + + @Test + void limitZeroIsEmptyButTruncated() { + // The one state where rows.isEmpty() coincides with truncated=true: LIMIT 0 over a + // non-empty source. Names survive, every type is null, and the truncation hint is set. + QueryResultSet rs = run("SELECT * FROM Student LIMIT 0"); + + assertTrue(rs.rows().isEmpty()); + assertTrue(rs.truncated()); + assertEquals(QueryResult.truncated(0).hint(), rs.hint()); + List names = rs.columns().stream().map(ColumnMeta::name).toList(); + assertEquals(List.of("a", "b", "c", "d"), names); + for (ColumnMeta col : rs.columns()) { + assertNull(col.type(), "no row means no inferable type"); + } + } + + @Test + void aggregateColumnsHaveNullQualifiedNameAndInferredTypes() { + // SUM(Typed.n) / MIN(Typed.s) keys contain a '.', so they pin the '(' exclusion that + // keeps aggregate columns out of qualifiedName; a no-group aggregate emits exactly one + // row, so first-row inference still types every column. + QueryResultSet rs = run("SELECT COUNT(*), SUM(Typed.n), MIN(Typed.s) FROM Typed"); + + assertEquals(1, rs.rows().size(), "a no-group aggregate emits exactly one row"); + List names = rs.columns().stream().map(ColumnMeta::name).toList(); + assertEquals(List.of("count(*)", "sum(typed.n)", "min(typed.s)"), names); + for (ColumnMeta col : rs.columns()) { + assertNull(col.qualifiedName(), "aggregate columns have no dotted origin: " + col.name()); + } + assertEquals(ColumnType.INT, rs.columns().get(0).type(), "count(*) is INT"); + assertEquals(ColumnType.INT, rs.columns().get(1).type(), "sum over int is INT"); + assertEquals(ColumnType.STRING, rs.columns().get(2).type(), "min over string is STRING"); + } +} diff --git a/engine/src/test/java/com/github/jinba1/cuckoodb/DBCatalogTest.java b/engine/src/test/java/com/github/jinba1/cuckoodb/DBCatalogTest.java index f6bb940..07e1e76 100644 --- a/engine/src/test/java/com/github/jinba1/cuckoodb/DBCatalogTest.java +++ b/engine/src/test/java/com/github/jinba1/cuckoodb/DBCatalogTest.java @@ -247,4 +247,173 @@ public void loadedTableSchemaIsImmutable() throws IOException { assertThrows(UnsupportedOperationException.class, () -> DBCatalog.getInstance().getDBSchemata("T").put("c", 9)); } + + // ---- E4: single TableMeta map (getTableMeta) + runtime registration (registerTable) ---- + + @Test + public void getTableMetaExposesPathSchemaAndTypesTogether() { + DBCatalog.initDBCatalog(SAMPLE_DB_DIR); + TableMeta meta = DBCatalog.getInstance().getTableMeta("Student"); + assertNotNull(meta, "loaded table must have meta"); + assertTrue(meta.path().toString().endsWith("Student.csv")); + assertEquals(DBCatalog.getInstance().getDBSchemata("Student"), meta.schema()); + assertEquals(DBCatalog.getInstance().getColumnTypes("Student"), meta.types()); + } + + @Test + public void getTableMetaReturnsNullForMissing() { + DBCatalog.initDBCatalog(SAMPLE_DB_DIR); + assertNull(DBCatalog.getInstance().getTableMeta("NoSuchTable")); + } + + @Test + public void registerTableMakesTableQueryable() throws IOException { + DBCatalog.resetDBCatalog(); + DBCatalog catalog = DBCatalog.getInstance(); + Path csv = tempDb.resolve("upload.csv"); + Files.writeString(csv, "id,name\n1,alice\n2,bob\n"); + + boolean registered = catalog.registerTable("Uploaded", csv); + + assertTrue(registered, "first registration wins"); + assertTrue(catalog.tableExists("Uploaded")); + assertEquals(Map.of("id", 0, "name", 1), catalog.getDBSchemata("Uploaded")); + assertEquals(List.of(ColumnType.INT, ColumnType.STRING), catalog.getColumnTypes("Uploaded")); + assertEquals(csv, catalog.getDBLocation("Uploaded")); + } + + @Test + public void registerTableDuplicateReturnsFalseAndKeepsOriginal() throws IOException { + DBCatalog.resetDBCatalog(); + DBCatalog catalog = DBCatalog.getInstance(); + Path first = tempDb.resolve("first.csv"); + Files.writeString(first, "id,name\n1,alice\n"); + Path second = tempDb.resolve("second.csv"); + Files.writeString(second, "x,y,z\n1,2,3\n"); + + assertTrue(catalog.registerTable("T", first)); + boolean again = catalog.registerTable("T", second); + + assertFalse(again, "duplicate registration must report not-registered (409 signal)"); + assertEquals(first, catalog.getDBLocation("T"), "original meta must be untouched"); + assertEquals(Map.of("id", 0, "name", 1), catalog.getDBSchemata("T")); + } + + @Test + public void registerTableRejectsBadCsvWithDataError() throws IOException { + DBCatalog.resetDBCatalog(); + DBCatalog catalog = DBCatalog.getInstance(); + Path csv = tempDb.resolve("ragged.csv"); + Files.writeString(csv, "a,b\n1,2,3\n"); + + QueryExecutionException e = assertThrows(QueryExecutionException.class, + () -> catalog.registerTable("Bad", csv)); + assertEquals(ErrorCode.DATA_ERROR, e.code()); + assertFalse(catalog.tableExists("Bad"), "failed parse must not leave a partial table"); + } + + @Test + public void registeredTableSchemaIsImmutable() throws IOException { + DBCatalog.resetDBCatalog(); + DBCatalog catalog = DBCatalog.getInstance(); + Path csv = tempDb.resolve("u.csv"); + Files.writeString(csv, "a,b\n1,2\n"); + catalog.registerTable("U", csv); + assertThrows(UnsupportedOperationException.class, + () -> catalog.getDBSchemata("U").put("c", 9)); + } + + /** + * The D4 invariant under contention: while one thread registers a table, readers + * see it either fully absent or fully present — never a torn state (path without + * types, schema without path). A single putIfAbsent guarantees this. + */ + @Test + public void registerTableIsAtomicAgainstConcurrentReaders() throws Exception { + DBCatalog.resetDBCatalog(); + DBCatalog catalog = DBCatalog.getInstance(); + Path csv = tempDb.resolve("race.csv"); + Files.writeString(csv, "a,b\n1,2\n"); + + int readers = 12; + int iterationsPerReader = 5000; + var pool = java.util.concurrent.Executors.newFixedThreadPool(readers + 1); + try { + var barrier = new java.util.concurrent.CyclicBarrier(readers + 1); + var torn = new java.util.concurrent.atomic.AtomicReference(); + var futures = new java.util.ArrayList>(); + for (int i = 0; i < readers; i++) { + futures.add(pool.submit(() -> { + barrier.await(); + for (int n = 0; n < iterationsPerReader; n++) { + TableMeta meta = catalog.getTableMeta("Race"); + if (meta != null) { + // Present => every field must be present and consistent. + if (meta.path() == null || meta.schema() == null || meta.types() == null) { + torn.compareAndSet(null, "TableMeta exposed with a null field"); + } else if (meta.schema().size() != meta.types().size()) { + torn.compareAndSet(null, "schema/types width mismatch"); + } + } + } + return null; + })); + } + // Writer thread registers mid-flight. + futures.add(pool.submit(() -> { + barrier.await(); + catalog.registerTable("Race", csv); + return null; + })); + for (var f : futures) f.get(); + assertNull(torn.get(), torn.get()); + assertTrue(catalog.tableExists("Race"), "table is present after the race"); + } finally { + pool.shutdownNow(); + } + } + + /** + * The 409 contract PR 4b's upload path depends on: when N threads register the SAME name + * at once, exactly one putIfAbsent wins (returns true), the rest report false, and the + * winner's meta is installed intact — never a torn blend of two registrations. + */ + @Test + public void registerTableHasExactlyOneWinnerAmongConcurrentSameNameRegisters() throws Exception { + DBCatalog.resetDBCatalog(); + DBCatalog catalog = DBCatalog.getInstance(); + + int k = 16; + List csvs = new java.util.ArrayList<>(); + for (int i = 0; i < k; i++) { + Path p = tempDb.resolve("w" + i + ".csv"); + Files.writeString(p, "a,b\n" + i + "," + i + "\n"); + csvs.add(p); + } + + var pool = java.util.concurrent.Executors.newFixedThreadPool(k); + try { + var barrier = new java.util.concurrent.CyclicBarrier(k); + var winners = new java.util.concurrent.atomic.AtomicInteger(); + var futures = new java.util.ArrayList>(); + for (int i = 0; i < k; i++) { + Path csv = csvs.get(i); + futures.add(pool.submit(() -> { + barrier.await(); + if (catalog.registerTable("Solo", csv)) { + winners.incrementAndGet(); + } + return null; + })); + } + for (var f : futures) f.get(); + + assertEquals(1, winners.get(), "exactly one concurrent register may win"); + assertTrue(catalog.tableExists("Solo")); + assertTrue(csvs.contains(catalog.getDBLocation("Solo")), + "the installed meta is one registration's, intact (not a torn blend)"); + } finally { + pool.shutdownNow(); + } + } } diff --git a/engine/src/test/java/com/github/jinba1/cuckoodb/QueryPlannerSqlTest.java b/engine/src/test/java/com/github/jinba1/cuckoodb/QueryPlannerSqlTest.java new file mode 100644 index 0000000..fa16497 --- /dev/null +++ b/engine/src/test/java/com/github/jinba1/cuckoodb/QueryPlannerSqlTest.java @@ -0,0 +1,128 @@ +package com.github.jinba1.cuckoodb; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import com.github.jinba1.cuckoodb.operator.Operator; + +/** + * E1: the String-SQL planning entrypoint ({@link QueryPlanner#planSql}) must behave + * identically to the file overload — same plan, same EXPLAIN rendering, same error + * classification — because both delegate to one shared parsed-statement core. The + * file overload itself stays exercised unchanged by SampleQueryRunner / ExplainEndToEndTest. + */ +class QueryPlannerSqlTest { + + private static final String SAMPLE_DB_DIR = "samples/db"; + + @TempDir + static Path tempDir; + + @BeforeAll + static void initCatalog() { + DBCatalog.resetDBCatalog(); + DBCatalog.initDBCatalog(SAMPLE_DB_DIR); + } + + @AfterAll + static void resetCatalog() { + DBCatalog.resetDBCatalog(); + } + + /** Plans the same SQL from a string and from a file; the rendered trees must match. */ + private void assertPlanParity(String sql) throws IOException { + QueryConfig cfg = QueryConfig.defaults(); + Path file = tempDir.resolve("q.sql"); + Files.writeString(file, sql); + + PlannedQuery fromFile = QueryPlanner.planQuery(file.toString(), cfg); + PlannedQuery fromSql = QueryPlanner.planSql(sql, cfg); + + assertEquals(PlanPrinter.print(fromFile.root()), PlanPrinter.print(fromSql.root()), + "string-source plan must render identically to file-source plan: " + sql); + assertEquals(fromFile.explainText(), fromSql.explainText(), + "EXPLAIN text must match across sources: " + sql); + } + + @Test + void selectStarParity() throws IOException { + assertPlanParity("SELECT * FROM Student"); + } + + @Test + void projectionAndWhereParity() throws IOException { + assertPlanParity("SELECT Student.A, Student.D FROM Student WHERE Student.D > 30"); + } + + @Test + void joinParity() throws IOException { + assertPlanParity( + "SELECT Student.A, Enrolled.H FROM Student, Enrolled WHERE Student.A = Enrolled.A"); + } + + @Test + void explainParityRendersBeforeAndAfter() throws IOException { + PlannedQuery planned = QueryPlanner.planSql( + "EXPLAIN SELECT Student.A FROM Student WHERE Student.D > 30", QueryConfig.defaults()); + assertNotNull(planned.explainText(), "EXPLAIN must populate explainText"); + assertTrue(planned.explainText().contains("Plan (as written)")); + assertTrue(planned.explainText().contains("Plan (optimized)")); + } + + @Test + void badSqlThrowsParseError() { + QueryExecutionException e = assertThrows(QueryExecutionException.class, + () -> QueryPlanner.planSql("SELECT FROM WHERE", QueryConfig.defaults())); + assertEquals(ErrorCode.PARSE_ERROR, e.code()); + } + + @Test + void nonSelectThrowsUnsupported() { + QueryExecutionException e = assertThrows(QueryExecutionException.class, + () -> QueryPlanner.planSql("DELETE FROM Student", QueryConfig.defaults())); + assertEquals(ErrorCode.UNSUPPORTED_SQL, e.code()); + } + + @Test + void trailingSemicolonIsAccepted() { + // A single statement with a trailing ';' is valid SQL — must plan, not error. + PlannedQuery planned = QueryPlanner.planSql("SELECT * FROM Student;", QueryConfig.defaults()); + assertNotNull(planned.root()); + assertNull(planned.explainText()); + } + + @Test + void nullSqlThrowsParseErrorNotNpe() { + // Every planSql failure must be a classified QueryExecutionException, never a raw NPE + // that the server would map to 500 instead of 400. + QueryExecutionException e = assertThrows(QueryExecutionException.class, + () -> QueryPlanner.planSql(null, QueryConfig.defaults())); + assertEquals(ErrorCode.PARSE_ERROR, e.code()); + } + + @Test + void blankSqlThrowsParseError() { + QueryExecutionException e = assertThrows(QueryExecutionException.class, + () -> QueryPlanner.planSql(" \n ", QueryConfig.defaults())); + assertEquals(ErrorCode.PARSE_ERROR, e.code()); + } + + @Test + void multipleStatementsAreRejected() { + // Read-only-by-construction: a second statement must never sneak through. JSqlParser's + // single-statement parse rejects it as a syntax error. + QueryExecutionException e = assertThrows(QueryExecutionException.class, + () -> QueryPlanner.planSql("SELECT * FROM Student; SELECT * FROM Course;", + QueryConfig.defaults())); + assertTrue(e.code() == ErrorCode.PARSE_ERROR || e.code() == ErrorCode.UNSUPPORTED_SQL, + "multi-statement input must be rejected, got " + e.code()); + } +} diff --git a/engine/src/test/java/com/github/jinba1/cuckoodb/SampleQueryByteIdenticalTest.java b/engine/src/test/java/com/github/jinba1/cuckoodb/SampleQueryByteIdenticalTest.java new file mode 100644 index 0000000..7449247 --- /dev/null +++ b/engine/src/test/java/com/github/jinba1/cuckoodb/SampleQueryByteIdenticalTest.java @@ -0,0 +1,94 @@ +package com.github.jinba1.cuckoodb; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** + * Runs the packaged sample queries through the CLI path and asserts each output is + * byte-identical to the committed golden output — row ORDER preserved (no sort), with only + * line endings, per-line trailing whitespace, and trailing newlines normalized. + * + *

This is the byte gate {@link SampleQueryRunner} performs as a manual {@code exec:exec} + * driver, now wired into {@code mvn test}/CI. The pre-existing {@code CuckooDBTest} sample + * check sorts and trims, so it cannot catch a row-ordering or separator regression — exactly + * what the {@code execute()}->{@code drain()} refactor must not introduce. This test does + * not sort, so it does. + */ +class SampleQueryByteIdenticalTest { + + @TempDir + Path outDir; + + @Test + void allSampleQueriesAreByteIdentical() throws IOException { + Path samples = Paths.get(System.getProperty("user.dir"), "samples"); + Path dbDir = samples.resolve("db"); + Path inputDir = samples.resolve("input"); + Path expectedDir = samples.resolve("expected_output"); + + File[] queryFiles = inputDir.toFile().listFiles((d, n) -> n.endsWith(".sql")); + assertNotNull(queryFiles, "sample input directory must exist: " + inputDir); + assertTrue(queryFiles.length >= 20, + "expected at least 20 sample queries, found " + queryFiles.length); + Arrays.sort(queryFiles); + + List failures = new ArrayList<>(); + for (File queryFile : queryFiles) { + String name = queryFile.getName(); + String base = name.substring(0, name.length() - 4); + Path expectedFile = expectedDir.resolve(base + ".csv"); + if (!Files.exists(expectedFile)) { + continue; // no golden output committed for this query + } + Path outputFile = outDir.resolve(base + ".csv"); + + DBCatalog.resetDBCatalog(); + DBCatalog.initDBCatalog(dbDir.toString()); + PlannedQuery planned = QueryPlanner.planQuery(queryFile.getAbsolutePath()); + if (planned.explainText() != null) { + Files.writeString(outputFile, planned.explainText()); + } else { + CuckooDB.execute(planned.root(), outputFile.toString()); + } + + String actual = normalize(Files.readString(outputFile)); + String expected = normalize(Files.readString(expectedFile)); + if (!actual.equals(expected)) { + failures.add(base); + } + } + assertTrue(failures.isEmpty(), "byte-identical mismatch against golden output in: " + failures); + } + + /** + * Mirrors {@link SampleQueryRunner}'s normalization: {@code \n} line endings, per-line + * trailing-whitespace strip, no trailing newline. Crucially does NOT sort rows, so a + * row-ordering regression in the CLI path fails this comparison. + */ + private static String normalize(String content) { + String normalized = content.replace("\r\n", "\n").replace("\r", "\n"); + String[] lines = normalized.split("\n", -1); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < lines.length; i++) { + if (i > 0) { + sb.append("\n"); + } + sb.append(lines[i].stripTrailing()); + } + while (sb.length() > 0 && sb.charAt(sb.length() - 1) == '\n') { + sb.deleteCharAt(sb.length() - 1); + } + return sb.toString(); + } +}