From 02798831d40f3a5deab837635a88ce8a31ec4c47 Mon Sep 17 00:00:00 2001 From: Roman Borschel Date: Wed, 9 Apr 2025 18:43:54 +0200 Subject: [PATCH 1/5] Fix tokenization of qualified identifiers with numeric prefix. Queries with qualified identifiers having numeric prefixes currently fail to parse due to incorrect tokenization. Currently, "t.123abc" tokenizes as "t" (Word) followed by ".123abc" (Number). --- src/tokenizer.rs | 76 ++++++++++++++++++++++++----- tests/sqlparser_mysql.rs | 100 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 12 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d33a7d8af9..9d2f2ce53f 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> { }; let mut location = state.location(); - while let Some(token) = self.next_token(&mut state)? { + while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? { let span = location.span_to(state.location()); buf.push(TokenWithSpan { token, span }); @@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> { } /// Get the next token or return None - fn next_token(&self, chars: &mut State) -> Result, TokenizerError> { + fn next_token( + &self, + chars: &mut State, + prev_token: Option<&Token>, + ) -> Result, TokenizerError> { match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), @@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> { chars.next(); } + // If the dialect supports identifiers that start with a numeric prefix + // and we have now consumed a dot, check if the previous token was a Word. + // If so, what follows is definitely not part of a decimal number and + // we should yield the dot as a dedicated token so compound identifiers + // starting with digits can be parsed correctly. + if s == "." 
&& self.dialect.supports_numeric_prefix() { + if let Some(Token::Word(_)) = prev_token { + return Ok(Some(Token::Period)); + } + } + + // Consume fractional digits. s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // No number -> Token::Period + // No fraction -> Token::Period if s == "." { return Ok(Some(Token::Period)); } - let mut exponent_part = String::new(); // Parse exponent as number + let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); exponent_part.push(char_clone.next().unwrap()); @@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> { } } - // mysql dialect supports identifiers that start with a numeric prefix, - // as long as they aren't an exponent number. - if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() { - let word = - peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); - - if !word.is_empty() { - s += word.as_str(); + // If the dialect supports identifiers that start with a numeric prefix, + // we need to check if the value is in fact an identifier and must thus + // be tokenized as a word. + if self.dialect.supports_numeric_prefix() { + if exponent_part.is_empty() { + // If it is not a number with an exponent, it may be + // an unqualified identifier starting with digits. + let word = + peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); + + if !word.is_empty() { + s += word.as_str(); + return Ok(Some(Token::make_word(s.as_str(), None))); + } + } else if prev_token == Some(&Token::Period) { + // If the previous token was a period, thus not belonging to a number, + // the value we have is part of an identifier. 
return Ok(Some(Token::make_word(s.as_str(), None))); } } @@ -3960,4 +3985,31 @@ mod tests { ], ); } + + #[test] + fn test_tokenize_identifiers_numeric_prefix() { + all_dialects_where(|dialect| dialect.supports_numeric_prefix()) + .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()) + .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( + "t.12e34", + vec![ + Token::make_word("t", None), + Token::Period, + Token::make_word("12e34", None), + ], + ); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( + "t.1two3", + vec![ + Token::make_word("t", None), + Token::Period, + Token::make_word("1two3", None), + ], + ); + } } diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index 1d4fd6a0d5..aab2690159 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -1926,6 +1926,106 @@ fn parse_select_with_numeric_prefix_column_name() { } } +#[test] +fn parse_qualified_identifiers_with_numeric_prefix() { + // Case 1: Qualified column name that starts with digits. + mysql().verified_stmt("SELECT t.15to29 FROM my_table AS t"); + match mysql() + .parse_sql_statements("SELECT t.15to29 FROM my_table AS t") + .unwrap() + .pop() + { + Some(Statement::Query(q)) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 2: Qualified column name that starts with digits and on its own represents a number. 
+ mysql().verified_stmt("SELECT t.15e29 FROM my_table AS t"); + match mysql() + .parse_sql_statements("SELECT t.15e29 FROM my_table AS t") + .unwrap() + .pop() + { + Some(Statement::Query(q)) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 3: Unqualified, the same token is parsed as a number. + match mysql() + .parse_sql_statements("SELECT 15e29 FROM my_table") + .unwrap() + .pop() + { + Some(Statement::Query(q)) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => { + assert_eq!(&number("15e29"), value); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 4: Quoted simple identifier. + mysql().verified_stmt("SELECT `15e29` FROM my_table"); + match mysql() + .parse_sql_statements("SELECT `15e29` FROM my_table") + .unwrap() + .pop() + { + Some(Statement::Query(q)) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => { + assert_eq!(&Ident::with_quote('`', "15e29"), name); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 5: Quoted compound identifier. 
+ mysql().verified_stmt("SELECT t.`15e29` FROM my_table"); + match mysql() + .parse_sql_statements("SELECT t.`15e29` FROM my_table AS t") + .unwrap() + .pop() + { + Some(Statement::Query(q)) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + &[Ident::new("t"), Ident::with_quote('`', "15e29")], + &parts[..] + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } +} + // Don't run with bigdecimal as it fails like this on rust beta: // // 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column' From 0eda729f875ff00d28893c83903bcdc799a40265 Mon Sep 17 00:00:00 2001 From: Roman Borschel Date: Wed, 9 Apr 2025 21:30:58 +0200 Subject: [PATCH 2/5] Update inline comment. --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 9d2f2ce53f..13bce0c0d9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1272,7 +1272,7 @@ impl<'a> Tokenizer<'a> { if self.dialect.supports_numeric_prefix() { if exponent_part.is_empty() { // If it is not a number with an exponent, it may be - // an unqualified identifier starting with digits. + // an identifier starting with digits. let word = peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); From e32c3c880509eb60fbe50bd2052968489e217864 Mon Sep 17 00:00:00 2001 From: Roman Borschel Date: Thu, 10 Apr 2025 11:36:07 +0200 Subject: [PATCH 3/5] Improve tests as suggested in code review. 
--- tests/sqlparser_mysql.rs | 78 +++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index aab2690159..ceb6424182 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -1929,13 +1929,8 @@ fn parse_select_with_numeric_prefix_column_name() { #[test] fn parse_qualified_identifiers_with_numeric_prefix() { // Case 1: Qualified column name that starts with digits. - mysql().verified_stmt("SELECT t.15to29 FROM my_table AS t"); - match mysql() - .parse_sql_statements("SELECT t.15to29 FROM my_table AS t") - .unwrap() - .pop() - { - Some(Statement::Query(q)) => match *q.body { + match mysql().verified_stmt("SELECT t.15to29 FROM my_table AS t") { + Statement::Query(q) => match *q.body { SetExpr::Select(s) => match s.projection.last() { Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]); @@ -1948,13 +1943,8 @@ fn parse_qualified_identifiers_with_numeric_prefix() { } // Case 2: Qualified column name that starts with digits and on its own represents a number. - mysql().verified_stmt("SELECT t.15e29 FROM my_table AS t"); - match mysql() - .parse_sql_statements("SELECT t.15e29 FROM my_table AS t") - .unwrap() - .pop() - { - Some(Statement::Query(q)) => match *q.body { + match mysql().verified_stmt("SELECT t.15e29 FROM my_table AS t") { + Statement::Query(q) => match *q.body { SetExpr::Select(s) => match s.projection.last() { Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]); @@ -1985,13 +1975,8 @@ fn parse_qualified_identifiers_with_numeric_prefix() { } // Case 4: Quoted simple identifier. 
- mysql().verified_stmt("SELECT `15e29` FROM my_table"); - match mysql() - .parse_sql_statements("SELECT `15e29` FROM my_table") - .unwrap() - .pop() - { - Some(Statement::Query(q)) => match *q.body { + match mysql().verified_stmt("SELECT `15e29` FROM my_table") { + Statement::Query(q) => match *q.body { SetExpr::Select(s) => match s.projection.last() { Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => { assert_eq!(&Ident::with_quote('`', "15e29"), name); @@ -2004,13 +1989,8 @@ fn parse_qualified_identifiers_with_numeric_prefix() { } // Case 5: Quoted compound identifier. - mysql().verified_stmt("SELECT t.`15e29` FROM my_table"); - match mysql() - .parse_sql_statements("SELECT t.`15e29` FROM my_table AS t") - .unwrap() - .pop() - { - Some(Statement::Query(q)) => match *q.body { + match mysql().verified_stmt("SELECT t.`15e29` FROM my_table AS t") { + Statement::Query(q) => match *q.body { SetExpr::Select(s) => match s.projection.last() { Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { assert_eq!( @@ -2024,6 +2004,48 @@ fn parse_qualified_identifiers_with_numeric_prefix() { }, stmt => panic!("Unexpected statement: {:?}", stmt), } + + // Case 6: Multi-level compound identifiers. + match mysql().verified_stmt("SELECT 1db.1table.1column") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + &[ + Ident::new("1db"), + Ident::new("1table"), + Ident::new("1column") + ], + &parts[..] + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + // Case 7: Multi-level compound quoted identifiers. 
+ match mysql().verified_stmt("SELECT `1`.`2`.`3`") { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + &[ + Ident::with_quote('`', "1"), + Ident::with_quote('`', "2"), + Ident::with_quote('`', "3") + ], + &parts[..] + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } } // Don't run with bigdecimal as it fails like this on rust beta: From aab48d43b53d647a371b7443c11d1ca66a17a72f Mon Sep 17 00:00:00 2001 From: Roman Borschel Date: Thu, 10 Apr 2025 15:41:35 +0200 Subject: [PATCH 4/5] Alternative fix implemented in the parser. --- src/parser/mod.rs | 36 ++++++++++++++++++++ src/tokenizer.rs | 72 +++++++--------------------------------- tests/sqlparser_mysql.rs | 57 +++++++++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 62 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index b9076bb777..bac5712e83 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1714,6 +1714,42 @@ impl<'a> Parser<'a> { && self.peek_token_ref().token == Token::LBracket { self.parse_multi_dim_subscript(&mut chain)?; + } else if self.dialect.supports_numeric_prefix() { + // When we get a Word or Number token while parsing a compound expression that + // starts with a dot (.), and using a dialect that supports identifiers with numeric + // prefixes, these tokens are part of qualified, unquoted identifiers and must be + // split up accordingly.
+ match self.peek_token_ref() { + TokenWithSpan { + token: Token::Word(w), + span, + } if w.value.starts_with(".") => { + let ident = w.value[1..].to_string(); + let new_span = Span::new( + Location::new(span.start.line, span.start.column + 1), + span.end, + ); + let expr = Expr::Identifier(Ident::with_span(new_span, ident)); + chain.push(AccessExpr::Dot(expr)); + self.advance_token(); + } + TokenWithSpan { + token: Token::Number(n, _), + span, + } if n.to_string().starts_with(".") => { + let ident = n.to_string()[1..].to_string(); + let new_span = Span::new( + Location::new(span.start.line, span.start.column + 1), + span.end, + ); + let expr = Expr::Identifier(Ident::with_span(new_span, ident)); + chain.push(AccessExpr::Dot(expr)); + self.advance_token(); + } + _ => { + break; + } + } } else { break; } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 13bce0c0d9..df376d3b5e 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> { }; let mut location = state.location(); - while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? { + while let Some(token) = self.next_token(&mut state)? { let span = location.span_to(state.location()); buf.push(TokenWithSpan { token, span }); @@ -932,11 +932,7 @@ impl<'a> Tokenizer<'a> { } /// Get the next token or return None - fn next_token( - &self, - chars: &mut State, - prev_token: Option<&Token>, - ) -> Result, TokenizerError> { + fn next_token(&self, chars: &mut State) -> Result, TokenizerError> { match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), @@ -1215,28 +1211,17 @@ impl<'a> Tokenizer<'a> { chars.next(); } - // If the dialect supports identifiers that start with a numeric prefix - // and we have now consumed a dot, check if the previous token was a Word. 
- // If so, what follows is definitely not part of a decimal number and - // we should yield the dot as a dedicated token so compound identifiers - // starting with digits can be parsed correctly. - if s == "." && self.dialect.supports_numeric_prefix() { - if let Some(Token::Word(_)) = prev_token { - return Ok(Some(Token::Period)); - } - } - // Consume fractional digits. s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // No fraction -> Token::Period + // No fraction -> Token::Period. if s == "." { return Ok(Some(Token::Period)); } - // Parse exponent as number + // Parse exponent, if present. let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); @@ -1269,20 +1254,14 @@ impl<'a> Tokenizer<'a> { // If the dialect supports identifiers that start with a numeric prefix, // we need to check if the value is in fact an identifier and must thus // be tokenized as a word. - if self.dialect.supports_numeric_prefix() { - if exponent_part.is_empty() { - // If it is not a number with an exponent, it may be - // an identifier starting with digits. - let word = - peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); - - if !word.is_empty() { - s += word.as_str(); - return Ok(Some(Token::make_word(s.as_str(), None))); - } - } else if prev_token == Some(&Token::Period) { - // If the previous token was a period, thus not belonging to a number, - // the value we have is part of an identifier. + if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() { + // If it is not a number with an exponent, it may be + // an identifier starting with digits. 
+ let word = + peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); + + if !word.is_empty() { + s += word.as_str(); return Ok(Some(Token::make_word(s.as_str(), None))); } } @@ -3985,31 +3964,4 @@ mod tests { ], ); } - - #[test] - fn test_tokenize_identifiers_numeric_prefix() { - all_dialects_where(|dialect| dialect.supports_numeric_prefix()) - .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]); - - all_dialects_where(|dialect| dialect.supports_numeric_prefix()) - .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]); - - all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( - "t.12e34", - vec![ - Token::make_word("t", None), - Token::Period, - Token::make_word("12e34", None), - ], - ); - - all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( - "t.1two3", - vec![ - Token::make_word("t", None), - Token::Period, - Token::make_word("1two3", None), - ], - ); - } } diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index ceb6424182..429194b602 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -25,9 +25,9 @@ use matches::assert_matches; use sqlparser::ast::MysqlInsertPriority::{Delayed, HighPriority, LowPriority}; use sqlparser::ast::*; use sqlparser::dialect::{GenericDialect, MySqlDialect}; -use sqlparser::parser::{ParserError, ParserOptions}; +use sqlparser::parser::{Parser, ParserError, ParserOptions}; use sqlparser::tokenizer::Span; -use sqlparser::tokenizer::Token; +use sqlparser::tokenizer::{Location, Token}; use test_utils::*; #[macro_use] @@ -1926,6 +1926,59 @@ fn parse_select_with_numeric_prefix_column_name() { } } +#[test] +fn test_qualified_identifiers_with_numeric_prefix_span() { + match Parser::new(&MySqlDialect {}) + .try_with_sql("SELECT t.15to29 FROM my_table AS t") + .unwrap() + .parse_statement() + .unwrap() + { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + 
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + Span::new(Location::new(1, 8), Location::new(1, 9)), + parts[0].span, + ); + assert_eq!( + Span::new(Location::new(1, 10), Location::new(1, 16)), + parts[1].span, + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } + + match Parser::new(&MySqlDialect {}) + .try_with_sql("SELECT t.15e29 FROM my_table AS t") + .unwrap() + .parse_statement() + .unwrap() + { + Statement::Query(q) => match *q.body { + SetExpr::Select(s) => match s.projection.last() { + Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { + assert_eq!( + Span::new(Location::new(1, 8), Location::new(1, 9)), + parts[0].span, + ); + assert_eq!( + Span::new(Location::new(1, 10), Location::new(1, 15)), + parts[1].span, + ); + } + proj => panic!("Unexpected projection: {:?}", proj), + }, + body => panic!("Unexpected statement body: {:?}", body), + }, + stmt => panic!("Unexpected statement: {:?}", stmt), + } +} + #[test] fn parse_qualified_identifiers_with_numeric_prefix() { // Case 1: Qualified column name that starts with digits. From a47e2ab751ee0eb3c3e4ec663ce3720b0e4af02d Mon Sep 17 00:00:00 2001 From: Roman Borschel Date: Fri, 11 Apr 2025 20:11:30 +0200 Subject: [PATCH 5/5] Revert "Alternative fix implemented in the parser." This reverts commit aab48d43b53d647a371b7443c11d1ca66a17a72f.
--- src/parser/mod.rs | 36 -------------------- src/tokenizer.rs | 72 +++++++++++++++++++++++++++++++++------- tests/sqlparser_mysql.rs | 57 ++----------------------------- 3 files changed, 62 insertions(+), 103 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index bac5712e83..b9076bb777 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1714,42 +1714,6 @@ impl<'a> Parser<'a> { && self.peek_token_ref().token == Token::LBracket { self.parse_multi_dim_subscript(&mut chain)?; - } else if self.dialect.supports_numeric_prefix() { - // When we get a Word or Number token while parsing a compound expression that - // starts with a dot (.), and using a dialect that supports identifiers with numeric - // prefixes, these tokens are part of qualified, unquoted identifiers and must be - // split up accordingly. - match self.peek_token_ref() { - TokenWithSpan { - token: Token::Word(w), - span, - } if w.value.starts_with(".") => { - let ident = w.value[1..].to_string(); - let new_span = Span::new( - Location::new(span.start.line, span.start.column + 1), - span.end, - ); - let expr = Expr::Identifier(Ident::with_span(new_span, ident)); - chain.push(AccessExpr::Dot(expr)); - self.advance_token(); - } - TokenWithSpan { - token: Token::Number(n, _), - span, - } if n.to_string().starts_with(".") => { - let ident = n.to_string()[1..].to_string(); - let new_span = Span::new( - Location::new(span.start.line, span.start.column + 1), - span.end, - ); - let expr = Expr::Identifier(Ident::with_span(new_span, ident)); - chain.push(AccessExpr::Dot(expr)); - self.advance_token(); - } - _ => { - break; - } - } } else { break; } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index df376d3b5e..13bce0c0d9 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> { }; let mut location = state.location(); - while let Some(token) = self.next_token(&mut state)? 
{ + while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? { let span = location.span_to(state.location()); buf.push(TokenWithSpan { token, span }); @@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> { } /// Get the next token or return None - fn next_token(&self, chars: &mut State) -> Result, TokenizerError> { + fn next_token( + &self, + chars: &mut State, + prev_token: Option<&Token>, + ) -> Result, TokenizerError> { match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), @@ -1211,17 +1215,28 @@ impl<'a> Tokenizer<'a> { chars.next(); } + // If the dialect supports identifiers that start with a numeric prefix + // and we have now consumed a dot, check if the previous token was a Word. + // If so, what follows is definitely not part of a decimal number and + // we should yield the dot as a dedicated token so compound identifiers + // starting with digits can be parsed correctly. + if s == "." && self.dialect.supports_numeric_prefix() { + if let Some(Token::Word(_)) = prev_token { + return Ok(Some(Token::Period)); + } + } + // Consume fractional digits. s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // No fraction -> Token::Period. + // No fraction -> Token::Period if s == "." { return Ok(Some(Token::Period)); } - // Parse exponent, if present. + // Parse exponent as number let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); @@ -1254,14 +1269,20 @@ impl<'a> Tokenizer<'a> { // If the dialect supports identifiers that start with a numeric prefix, // we need to check if the value is in fact an identifier and must thus // be tokenized as a word. - if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() { - // If it is not a number with an exponent, it may be - // an identifier starting with digits. 
- let word = - peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); - - if !word.is_empty() { - s += word.as_str(); + if self.dialect.supports_numeric_prefix() { + if exponent_part.is_empty() { + // If it is not a number with an exponent, it may be + // an identifier starting with digits. + let word = + peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); + + if !word.is_empty() { + s += word.as_str(); + return Ok(Some(Token::make_word(s.as_str(), None))); + } + } else if prev_token == Some(&Token::Period) { + // If the previous token was a period, thus not belonging to a number, + // the value we have is part of an identifier. return Ok(Some(Token::make_word(s.as_str(), None))); } } @@ -3964,4 +3985,31 @@ mod tests { ], ); } + + #[test] + fn test_tokenize_identifiers_numeric_prefix() { + all_dialects_where(|dialect| dialect.supports_numeric_prefix()) + .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()) + .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( + "t.12e34", + vec![ + Token::make_word("t", None), + Token::Period, + Token::make_word("12e34", None), + ], + ); + + all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( + "t.1two3", + vec![ + Token::make_word("t", None), + Token::Period, + Token::make_word("1two3", None), + ], + ); + } } diff --git a/tests/sqlparser_mysql.rs b/tests/sqlparser_mysql.rs index 429194b602..ceb6424182 100644 --- a/tests/sqlparser_mysql.rs +++ b/tests/sqlparser_mysql.rs @@ -25,9 +25,9 @@ use matches::assert_matches; use sqlparser::ast::MysqlInsertPriority::{Delayed, HighPriority, LowPriority}; use sqlparser::ast::*; use sqlparser::dialect::{GenericDialect, MySqlDialect}; -use sqlparser::parser::{Parser, ParserError, ParserOptions}; +use sqlparser::parser::{ParserError, ParserOptions}; use 
sqlparser::tokenizer::Span; -use sqlparser::tokenizer::{Location, Token}; +use sqlparser::tokenizer::Token; use test_utils::*; #[macro_use] @@ -1926,59 +1926,6 @@ fn parse_select_with_numeric_prefix_column_name() { } } -#[test] -fn test_qualified_identifiers_with_numeric_prefix_span() { - match Parser::new(&MySqlDialect {}) - .try_with_sql("SELECT t.15to29 FROM my_table AS t") - .unwrap() - .parse_statement() - .unwrap() - { - Statement::Query(q) => match *q.body { - SetExpr::Select(s) => match s.projection.last() { - Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { - assert_eq!( - Span::new(Location::new(1, 8), Location::new(1, 9)), - parts[0].span, - ); - assert_eq!( - Span::new(Location::new(1, 10), Location::new(1, 16)), - parts[1].span, - ); - } - proj => panic!("Unexpected projection: {:?}", proj), - }, - body => panic!("Unexpected statement body: {:?}", body), - }, - stmt => panic!("Unexpected statement: {:?}", stmt), - } - - match Parser::new(&MySqlDialect {}) - .try_with_sql("SELECT t.15e29 FROM my_table AS t") - .unwrap() - .parse_statement() - .unwrap() - { - Statement::Query(q) => match *q.body { - SetExpr::Select(s) => match s.projection.last() { - Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => { - assert_eq!( - Span::new(Location::new(1, 8), Location::new(1, 9)), - parts[0].span, - ); - assert_eq!( - Span::new(Location::new(1, 10), Location::new(1, 15)), - parts[1].span, - ); - } - proj => panic!("Unexpected projection: {:?}", proj), - }, - body => panic!("Unexpected statement body: {:?}", body), - }, - stmt => panic!("Unexpected statement: {:?}", stmt), - } -} - #[test] fn parse_qualified_identifiers_with_numeric_prefix() { // Case 1: Qualified column name that starts with digits.