
Merging upstream version 25.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann, 2025-02-13 21:41:14 +01:00
commit 029b9c2c73, parent 298e7a8147
Signed by: daniel (GPG key ID: FBB4F0E80A80222F)
136 changed files with 80990 additions and 72541 deletions

sqlglotrs/Cargo.lock (generated)

@@ -188,7 +188,7 @@ checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970"
 
 [[package]]
 name = "sqlglotrs"
-version = "0.2.6"
+version = "0.2.8"
 dependencies = [
  "pyo3",
 ]

sqlglotrs/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "sqlglotrs"
-version = "0.2.6"
+version = "0.2.8"
 edition = "2021"
 
 [lib]

sqlglotrs/src/settings.rs

@@ -76,6 +76,7 @@ pub struct TokenizerSettings {
     pub commands: HashSet<TokenType>,
     pub command_prefix_tokens: HashSet<TokenType>,
     pub heredoc_tag_is_identifier: bool,
+    pub string_escapes_allowed_in_raw_strings: bool,
 }
 
 #[pymethods]
@@ -98,6 +99,7 @@ impl TokenizerSettings {
         commands: HashSet<TokenType>,
         command_prefix_tokens: HashSet<TokenType>,
         heredoc_tag_is_identifier: bool,
+        string_escapes_allowed_in_raw_strings: bool,
     ) -> Self {
         let to_char = |v: &String| {
             if v.len() == 1 {
@@ -147,6 +149,7 @@ impl TokenizerSettings {
             commands,
             command_prefix_tokens,
             heredoc_tag_is_identifier,
+            string_escapes_allowed_in_raw_strings,
         }
     }
 }
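The new string_escapes_allowed_in_raw_strings field mirrors a tokenizer option on the Python side of sqlglot, where dialects configure their tokenizers through class attributes. A minimal sketch of toggling it for a custom dialect, assuming the Python attribute is the upper-cased counterpart STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (an inference from the field name here, not confirmed by this diff):

# Sketch only: the attribute name below is assumed from the Rust field above.
from sqlglot import tokens
from sqlglot.dialects.dialect import Dialect

class RawStringDialect(Dialect):  # hypothetical dialect, for illustration only
    class Tokenizer(tokens.Tokenizer):
        # When False, string escape characters are not treated as escapes
        # inside raw strings, matching the new Rust setting.
        STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = False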

sqlglotrs/src/tokenizer.rs

@@ -361,10 +361,24 @@ impl<'a> TokenizerState<'a> {
             // Skip the comment's start delimiter.
             self.advance(comment_start_size as isize)?;
 
+            let mut comment_count = 1;
             let comment_end_size = comment_end.len();
 
-            while !self.is_end && self.chars(comment_end_size) != *comment_end {
+            while !self.is_end {
+                if self.chars(comment_end_size) == *comment_end {
+                    comment_count -= 1;
+                    if comment_count == 0 {
+                        break;
+                    }
+                }
+
                 self.advance(1)?;
+
+                // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
+                if !self.is_end && self.chars(comment_start_size) == *comment_start {
+                    self.advance(comment_start_size as isize)?;
+                    comment_count += 1
+                }
             }
 
             let text = self.text();
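The new loop keeps a nesting counter because, as the inline comment notes, dialects such as databricks, duckdb, and postgres allow block comments to nest. A quick way to see the effect from the Python API, assuming this sqlglot/sqlglotrs release is installed (a sketch, not part of the diff):

# The nested block comment should now be consumed as a single comment
# instead of the scan stopping at the first "*/".
import sqlglot

sql = "SELECT 1 /* outer /* inner */ still outer */"
for token in sqlglot.tokenize(sql, read="duckdb"):
    print(token)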
@@ -410,7 +424,7 @@ impl<'a> TokenizerState<'a> {
         let tag = if self.current_char.to_string() == *end {
             String::from("")
         } else {
-            self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)?
+            self.extract_string(end, false, true, !self.settings.heredoc_tag_is_identifier)?
         };
 
         if !tag.is_empty()
@@ -435,7 +449,7 @@ impl<'a> TokenizerState<'a> {
         };
 
         self.advance(start.len() as isize)?;
-        let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?;
+        let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
 
         if let Some(b) = base {
             if u64::from_str_radix(&text, b).is_err() {
@@ -581,7 +595,7 @@ impl<'a> TokenizerState<'a> {
 
     fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> {
         self.advance(1)?;
-        let text = self.extract_string(identifier_end, true, true, true)?;
+        let text = self.extract_string(identifier_end, true, false, true)?;
         self.add(self.token_types.identifier, Some(text))
     }
 
@@ -589,7 +603,7 @@ impl<'a> TokenizerState<'a> {
         &mut self,
         delimiter: &str,
         use_identifier_escapes: bool,
-        unescape_sequences: bool,
+        raw_string: bool,
         raise_unmatched: bool,
     ) -> Result<String, TokenizerError> {
         let mut text = String::from("");
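This hunk renames extract_string's third parameter from unescape_sequences to raw_string and inverts its sense, which is why each call site above flips the boolean it passes. A tiny sketch of that relationship (the helper name is made up for illustration):

# Hypothetical helper restating the inversion implied by the rename:
# the old unescape_sequences flag equals `not raw_string` at every call site.
def unescape_sequences(raw_string: bool) -> bool:
    return not raw_string

assert unescape_sequences(raw_string=False) is True   # quoted identifiers, regular strings
assert unescape_sequences(raw_string=True) is False   # heredoc tags, raw string literals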
@@ -602,7 +616,7 @@ impl<'a> TokenizerState<'a> {
             };
 
             let peek_char_str = self.peek_char.to_string();
-            if unescape_sequences
+            if !raw_string
                 && !self.dialect_settings.unescaped_sequences.is_empty()
                 && !self.peek_char.is_whitespace()
                 && self.settings.string_escapes.contains(&self.current_char)
@@ -617,7 +631,8 @@ impl<'a> TokenizerState<'a> {
                 }
             }
 
-            if escapes.contains(&self.current_char)
+            if (self.settings.string_escapes_allowed_in_raw_strings || !raw_string)
+                && escapes.contains(&self.current_char)
                 && (peek_char_str == delimiter || escapes.contains(&self.peek_char))
                 && (self.current_char == self.peek_char
                     || !self