Skip to content

Commit 1a55f07

Browse files
feat(duckdb): transpile snowflake's JAROWINKLER_SIMILARITY properly (#7501)
* fix(snowflake): transpile JAROWINKLER_SIMILARITY to DuckDB with correct scale

  Snowflake's JAROWINKLER_SIMILARITY returns values on a 0-100 integer scale, while DuckDB's JARO_WINKLER_SIMILARITY returns values on a 0.0-1.0 float scale.

  Changes:
  - Added integer_scale flag to JarowinklerSimilarity expression
  - Snowflake parser sets integer_scale=True when parsing JAROWINKLER_SIMILARITY
  - DuckDB generator multiplies by 100 and casts to INTEGER when flag is set
  - Preserves DuckDB roundtrip (no scale conversion when flag is not set)

  This ensures semantic equivalence when transpiling from Snowflake to DuckDB.

* chore: update integration tests submodule
1 parent 9422619 commit 1a55f07

5 files changed

Lines changed: 24 additions & 5 deletions

File tree

sqlglot-integration-tests

sqlglot/expressions/math.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,12 @@ class EuclideanDistance(Expression, Func):
108108

109109

110110
class JarowinklerSimilarity(Expression, Func):
111-
arg_types = {"this": True, "expression": True, "case_insensitive": False}
111+
arg_types = {
112+
"this": True,
113+
"expression": True,
114+
"case_insensitive": False,
115+
"integer_scale": False,
116+
}
112117

113118

114119
class ManhattanDistance(Expression, Func):

sqlglot/generators/duckdb.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
getbit_sql,
2525
groupconcat_sql,
2626
inline_array_unless_query,
27-
jarowinkler_similarity,
2827
months_between_sql,
2928
no_datetime_sql,
3029
no_comment_column_constraint_sql,
@@ -1609,7 +1608,6 @@ class DuckDBGenerator(generator.Generator):
16091608
),
16101609
exp.Ceil: _ceil_floor,
16111610
exp.Floor: _ceil_floor,
1612-
exp.JarowinklerSimilarity: jarowinkler_similarity("JARO_WINKLER_SIMILARITY"),
16131611
exp.JSONBExists: rename_func("JSON_EXISTS"),
16141612
exp.JSONExtract: _arrow_json_extract_sql,
16151613
exp.JSONExtractArray: _json_extract_value_array_sql,
@@ -2285,6 +2283,21 @@ def parseip_sql(self, expression: exp.ParseIp) -> str:
22852283
self.unsupported("PARSE_IP is not supported in DuckDB")
22862284
return self.function_fallback_sql(expression)
22872285

2286+
def jarowinklersimilarity_sql(self, expression: exp.JarowinklerSimilarity) -> str:
2287+
this = expression.this
2288+
expr = expression.expression
2289+
2290+
if expression.args.get("case_insensitive"):
2291+
this = exp.Upper(this=this)
2292+
expr = exp.Upper(this=expr)
2293+
2294+
result = exp.func("JARO_WINKLER_SIMILARITY", this, expr)
2295+
2296+
if expression.args.get("integer_scale"):
2297+
result = exp.cast(result * 100, "INTEGER")
2298+
2299+
return self.sql(result)
2300+
22882301
def nthvalue_sql(self, expression: exp.NthValue) -> str:
22892302
from_first = expression.args.get("from_first", True)
22902303
if not from_first:

sqlglot/parsers/snowflake.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ class SnowflakeParser(parser.Parser):
539539
this=seq_get(args, 0),
540540
expression=seq_get(args, 1),
541541
case_insensitive=True,
542+
integer_scale=True,
542543
),
543544
"MD5_HEX": exp.MD5.from_arg_list,
544545
"MD5_BINARY": exp.MD5Digest.from_arg_list,

tests/dialects/test_snowflake.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ def test_snowflake(self):
221221
"JAROWINKLER_SIMILARITY('hello', 'world')",
222222
write={
223223
"snowflake": "JAROWINKLER_SIMILARITY('hello', 'world')",
224-
"duckdb": "JARO_WINKLER_SIMILARITY(UPPER('hello'), UPPER('world'))",
224+
"duckdb": "CAST(JARO_WINKLER_SIMILARITY(UPPER('hello'), UPPER('world')) * 100 AS INT)",
225225
"clickhouse": "jaroWinklerSimilarity(UPPER('hello'), UPPER('world'))",
226226
},
227227
)

0 commit comments

Comments
 (0)