Skip to content

Commit daca12a

Browse files
fix(mysql): accept quoted charset in USING. (#7522)
* fix(mysql): accept quoted charset in `USING`. MySQL allows backtick-quoted charset names in both `CONVERT(... USING)` and `CHAR(... USING)`. Both called `_parse_var`, which only matches unquoted tokens. Adding `TokenType.IDENTIFIER` to the token set accepts quoted charsets without changing behavior for unquoted ones. * fix(mysql): Support for `CHAR(... USING BINARY)`. Handle parsing of `SELECT CHAR(65 USING BINARY)`, where `BINARY` is a valid MySQL character set. * fix(mysql): preserve quoted charset names. Charset names requiring quotes (e.g. names containing spaces, from custom XML-registered charsets per https://dev.mysql.com/doc/refman/8.0/en/identifiers.html) were roundtripping to invalid SQL. For MySQL, these are now retained as quoted identifiers while safe names are still emitted bare. * fix(mysql): Reuse `SAFE_IDENTIFIER_RE`. Reuse `SAFE_IDENTIFIER_RE` instead of using a dialect-specific parallel copy. * Update sqlglot/parsers/mysql.py * Update sqlglot/parser.py --------- Co-authored-by: Jo <46752250+georgesittas@users.noreply.github.com>
1 parent 8583c2d commit daca12a

3 files changed

Lines changed: 49 additions & 4 deletions

File tree

sqlglot/parser.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7599,10 +7599,19 @@ def _parse_char(self) -> exp.Chr:
75997599
return self.expression(
76007600
exp.Chr(
76017601
expressions=self._parse_csv(self._parse_assignment),
7602-
charset=self._match(TokenType.USING) and self._parse_var(),
7602+
charset=self._match(TokenType.USING) and self._parse_charset_name(),
76037603
)
76047604
)
76057605

7606+
def _parse_charset_name(self) -> exp.Expr | None:
7607+
"""
7608+
Parse a charset name after USING or CHARACTER SET, accepting a plain name, the BINARY keyword,
7609+
or a quoted identifier. Dialects that need to preserve quoting for specific name shapes override this.
7610+
"""
7611+
return self._parse_var(
7612+
tokens={TokenType.BINARY, TokenType.IDENTIFIER},
7613+
)
7614+
76067615
def _parse_cast(self, strict: bool, safe: bool | None = None) -> exp.Expr:
76077616
this = self._parse_assignment()
76087617

@@ -7719,9 +7728,7 @@ def _parse_convert(self, strict: bool, safe: bool | None = None) -> exp.Expr | N
77197728
this = self._parse_bitwise()
77207729

77217730
if self._match(TokenType.USING):
7722-
to: exp.Expr | None = exp.DType.CHARACTER_SET.into_expr(
7723-
kind=self._parse_var(tokens={TokenType.BINARY})
7724-
)
7731+
to: exp.Expr | None = exp.DType.CHARACTER_SET.into_expr(kind=self._parse_charset_name())
77257732
elif self._match(TokenType.COMMA):
77267733
to = self._parse_types()
77277734
else:

sqlglot/parsers/mysql.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,20 @@ def _parse_set_item_charset(self, kind: str) -> exp.Expr:
486486
this = self._parse_string() or self._parse_unquoted_field()
487487
return self.expression(exp.SetItem(this=this, kind=kind))
488488

489+
def _parse_charset_name(self) -> exp.Expr | None:
490+
"""
491+
Parse a charset name, keeping the parsed identifier (and thus its quoting) when the name does not
492+
match `exp.SAFE_IDENTIFIER_RE` (e.g. it contains spaces, as allowed for custom XML-registered charsets). Names that do match unwrap to a bare Var so roundtrips remain minimal. If no identifier is parsed, fall back to `_parse_var`, which additionally accepts the BINARY token.
493+
"""
494+
identifier = self._parse_identifier()
495+
if identifier:
496+
return (
497+
exp.Var(this=name)
498+
if exp.SAFE_IDENTIFIER_RE.match(name := identifier.name)  # condition is evaluated first, binding `name` for the Var branch above
499+
else identifier
500+
)
501+
return self._parse_var(tokens={TokenType.BINARY})
502+
489503
def _parse_set_item_names(self) -> exp.Expr:
490504
charset = self._parse_string() or self._parse_unquoted_field()
491505
if self._match_text_seq("COLLATE"):

tests/dialects/test_mysql.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,30 @@ def test_convert(self):
634634
self.validate_identity(
635635
"CONVERT('a' USING binary)", "CAST('a' AS CHAR CHARACTER SET binary)"
636636
)
637+
self.validate_identity(
638+
"SELECT CONVERT(`col` USING `utf8mb4`)",
639+
"SELECT CAST(`col` AS CHAR CHARACTER SET utf8mb4)",
640+
)
641+
self.validate_identity(
642+
"SELECT CHAR(0xC3A9 USING `utf8mb4`)",
643+
"SELECT CHAR(x'C3A9' USING utf8mb4)",
644+
)
645+
self.validate_identity("SELECT CHAR(65 USING BINARY)")
646+
self.validate_identity(
647+
"SELECT CHAR(65 USING `binary`)",
648+
"SELECT CHAR(65 USING binary)",
649+
)
650+
self.validate_identity(
651+
"SELECT CONVERT(x USING `binary`)",
652+
"SELECT CAST(x AS CHAR CHARACTER SET binary)",
653+
)
654+
self.validate_identity(
655+
"SELECT CONVERT(x USING `my charset`)",
656+
"SELECT CAST(x AS CHAR CHARACTER SET `my charset`)",
657+
)
658+
self.validate_identity(
659+
"SELECT CHAR(65 USING `my charset`)",
660+
)
637661

638662
def test_match_against(self):
639663
self.validate_all(

0 commit comments

Comments
 (0)