Skip to content

Commit d289db3

Browse files
feat(starrocks)!: stop eliminating semi/anti joins, QUALIFY, and FULL OUTER JOIN [CLAUDE] (#7524)
* feat(starrocks): support semi/anti joins, eliminate DISTINCT ON & GENERATE_DATE_ARRAY [CLAUDE]

StarRocks supports LEFT SEMI JOIN and LEFT ANTI JOIN natively, so remove ANTI and SEMI from TABLE_ALIAS_TOKENS (MySQL added them back since it lacks semi/anti join syntax).

Add Select preprocessors to eliminate DISTINCT ON (not supported) and rewrite UNNEST(GENERATE_DATE_ARRAY(...)) to recursive CTEs (StarRocks only supports numeric inputs for array_generate/generate_series).

Made-with: Cursor

* Update tests/dialects/test_starrocks.py

* Update tests/dialects/test_starrocks.py

---------

Co-authored-by: Jo <46752250+georgesittas@users.noreply.github.com>
1 parent f8af9d6 commit d289db3

5 files changed

Lines changed: 30 additions & 1 deletion

File tree

sqlglot/generators/starrocks.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,14 @@ class StarRocksGenerator(MySQLGenerator):
100100
exp.JSONExtract: arrow_json_extract_sql,
101101
exp.Property: property_sql,
102102
exp.RegexpLike: rename_func("REGEXP"),
103+
# MySQL's Select preprocessors, minus the rewrites for constructs that StarRocks
104+
# supports natively (QUALIFY, FULL OUTER JOIN, SEMI/ANTI JOIN), which are emitted as-is
105+
exp.Select: transforms.preprocess(
106+
[
107+
transforms.eliminate_distinct_on,
108+
transforms.unnest_generate_date_array_using_recursive_cte,
109+
]
110+
),
103111
exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
104112
exp.SqlSecurityProperty: lambda self, e: f"SECURITY {self.sql(e.this)}",
105113
exp.StDistance: st_distance_sphere,

sqlglot/parsers/starrocks.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,14 @@
55
from sqlglot.dialects.dialect import build_date_delta_with_interval, build_timestamp_trunc
66
from sqlglot.helper import seq_get
77
from sqlglot.parsers.mysql import MySQLParser
8+
from sqlglot.tokens import TokenType
89

910

1011
class StarRocksParser(MySQLParser):
12+
# StarRocks supports LEFT SEMI JOIN and LEFT ANTI JOIN natively
13+
# https://docs.starrocks.io/docs/sql-reference/sql-statements/table_bucket_part_index/SELECT/SELECT_JOIN/
14+
TABLE_ALIAS_TOKENS = MySQLParser.TABLE_ALIAS_TOKENS - {TokenType.ANTI, TokenType.SEMI}
15+
1116
FUNCTIONS = {
1217
**MySQLParser.FUNCTIONS,
1318
"ADDDATE": build_date_delta_with_interval(exp.DateAdd, default_unit="DAY"),

tests/dialects/test_dialect.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3641,6 +3641,7 @@ def test_generate_date_array(self):
36413641
"databricks": "SELECT * FROM EXPLODE(SEQUENCE(CAST('2020-01-01' AS DATE), CAST('2020-02-01' AS DATE), INTERVAL '1' WEEK))",
36423642
"duckdb": "SELECT * FROM UNNEST(CAST(GENERATE_SERIES(CAST('2020-01-01' AS DATE), CAST('2020-02-01' AS DATE), INTERVAL '1' WEEK) AS DATE[]))",
36433643
"mysql": "WITH RECURSIVE _generated_dates(date_value) AS (SELECT CAST('2020-01-01' AS DATE) AS date_value UNION ALL SELECT CAST(DATE_ADD(date_value, INTERVAL 1 WEEK) AS DATE) FROM _generated_dates WHERE CAST(DATE_ADD(date_value, INTERVAL 1 WEEK) AS DATE) <= CAST('2020-02-01' AS DATE)) SELECT * FROM (SELECT date_value FROM _generated_dates) AS _generated_dates",
3644+
"starrocks": "WITH RECURSIVE _generated_dates(date_value) AS (SELECT CAST('2020-01-01' AS DATE) AS date_value UNION ALL SELECT CAST(DATE_ADD(date_value, INTERVAL 1 WEEK) AS DATE) FROM _generated_dates WHERE CAST(DATE_ADD(date_value, INTERVAL 1 WEEK) AS DATE) <= CAST('2020-02-01' AS DATE)) SELECT * FROM (SELECT date_value FROM _generated_dates) AS _generated_dates",
36443645
"postgres": "SELECT * FROM (SELECT CAST(value AS DATE) FROM GENERATE_SERIES(CAST('2020-01-01' AS DATE), CAST('2020-02-01' AS DATE), INTERVAL '1 WEEK') AS _t(value)) AS _unnested_generate_series",
36453646
"presto": "SELECT * FROM UNNEST(SEQUENCE(CAST('2020-01-01' AS DATE), CAST('2020-02-01' AS DATE), (1 * INTERVAL '7' DAY)))",
36463647
"redshift": "WITH RECURSIVE _generated_dates(date_value) AS (SELECT CAST('2020-01-01' AS DATE) AS date_value UNION ALL SELECT CAST(DATEADD(WEEK, 1, date_value) AS DATE) FROM _generated_dates WHERE CAST(DATEADD(WEEK, 1, date_value) AS DATE) <= CAST('2020-02-01' AS DATE)) SELECT * FROM (SELECT date_value FROM _generated_dates) AS _generated_dates",

tests/dialects/test_duckdb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ def test_duckdb(self):
233233
"snowflake": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
234234
"spark": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
235235
"sqlite": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
236-
"starrocks": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
236+
"starrocks": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
237237
"teradata": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
238238
"trino": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
239239
"tsql": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",

tests/dialects/test_starrocks.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,21 @@ def test_starrocks(self):
2727

2828
self.validate_identity("CURRENT_VERSION()")
2929

30+
self.validate_identity("SELECT t1.id FROM t1 LEFT ANTI JOIN t2 ON t1.id = t2.id")
31+
self.validate_identity("SELECT t1.id FROM t1 LEFT SEMI JOIN t2 ON t1.id = t2.id")
32+
33+
def test_distinct_on(self):
34+
self.validate_identity(
35+
"SELECT DISTINCT ON (a) a, b FROM x ORDER BY c DESC",
36+
"SELECT a, b FROM (SELECT a AS a, b AS b, ROW_NUMBER() OVER (PARTITION BY a ORDER BY c DESC) AS _row_number FROM x) AS _t WHERE _row_number = 1",
37+
)
38+
39+
def test_generate_date_array(self):
40+
self.validate_identity(
41+
"SELECT * FROM UNNEST(GENERATE_DATE_ARRAY(DATE '2020-01-01', DATE '2020-02-01', INTERVAL 1 WEEK)) AS _q(date_week)",
42+
"WITH RECURSIVE _generated_dates(date_week) AS (SELECT CAST('2020-01-01' AS DATE) AS date_week UNION ALL SELECT CAST(DATE_ADD(date_week, INTERVAL 1 WEEK) AS DATE) FROM _generated_dates WHERE CAST(DATE_ADD(date_week, INTERVAL 1 WEEK) AS DATE) <= CAST('2020-02-01' AS DATE)) SELECT * FROM (SELECT date_week FROM _generated_dates) AS _generated_dates",
43+
)
44+
3045
def test_ddl(self):
3146
self.validate_identity("INSERT OVERWRITE my_table SELECT * FROM other_table")
3247
self.validate_identity("CREATE TABLE t (c INT) COMMENT 'c'")

0 commit comments

Comments
 (0)