
Adding upstream version 26.1.3.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-13 21:59:41 +01:00
parent 09521056ff
commit d908bee480
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
119 changed files with 71635 additions and 68059 deletions


@@ -1,90 +1,13 @@
use pyo3::prelude::*;
use pyo3::types::{PyList, PyNone, PyString};
use pyo3::{pymodule, types::PyModule, Bound, PyResult};
use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use token::Token;
use tokenizer::Tokenizer;
mod settings;
mod tokenizer;
mod trie;
pub use self::settings::{
TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings,
};
pub use self::tokenizer::Tokenizer;
#[derive(Debug)]
#[pyclass]
pub struct Token {
#[pyo3(get, name = "token_type_index")]
pub token_type: TokenType,
#[pyo3(get, set, name = "token_type")]
pub token_type_py: PyObject,
#[pyo3(get)]
pub text: Py<PyString>,
#[pyo3(get)]
pub line: usize,
#[pyo3(get)]
pub col: usize,
#[pyo3(get)]
pub start: usize,
#[pyo3(get)]
pub end: usize,
#[pyo3(get)]
pub comments: Py<PyList>,
}
impl Token {
pub fn new(
token_type: TokenType,
text: String,
line: usize,
col: usize,
start: usize,
end: usize,
comments: Vec<String>,
) -> Token {
Python::with_gil(|py| Token {
token_type,
token_type_py: PyNone::get_bound(py).into_py(py),
text: PyString::new_bound(py, &text).into_py(py),
line,
col,
start,
end,
comments: PyList::new_bound(py, &comments).into(),
})
}
pub fn append_comments(&self, comments: &mut Vec<String>) {
Python::with_gil(|py| {
let pylist = self.comments.bind(py);
for comment in comments.iter() {
if let Err(_) = pylist.append(comment) {
panic!("Failed to append comments to the Python list");
}
}
});
// Simulate `Vec::append`.
let _ = std::mem::replace(comments, Vec::new());
}
}
#[pymethods]
impl Token {
#[pyo3(name = "__repr__")]
fn python_repr(&self) -> PyResult<String> {
Python::with_gil(|py| {
Ok(format!(
"<Token token_type: {}, text: {}, line: {}, col: {}, start: {}, end: {}, comments: {}>",
self.token_type_py.bind(py).repr()?,
self.text.bind(py).repr()?,
self.line,
self.col,
self.start,
self.end,
self.comments.bind(py).repr()?,
))
})
}
}
pub mod settings;
pub mod token;
pub mod tokenizer;
pub mod trie;
#[pymodule]
fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {


@@ -1,10 +1,12 @@
use pyo3::prelude::*;
use std::collections::{HashMap, HashSet};
use pyo3::prelude::*;
pub type TokenType = u16;
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenTypeSettings {
pub bit_string: TokenType,
pub break_: TokenType,
@@ -41,7 +43,7 @@ impl TokenTypeSettings {
heredoc_string_alternative: TokenType,
hint: TokenType,
) -> Self {
TokenTypeSettings {
let token_type_settings = TokenTypeSettings {
bit_string,
break_,
dcolon,
@@ -56,12 +58,31 @@ impl TokenTypeSettings {
var,
heredoc_string_alternative,
hint,
};
#[cfg(feature = "profiling")]
{
token_type_settings.write_json_to_string();
}
token_type_settings
}
}
#[cfg(feature = "profiling")]
impl TokenTypeSettings {
pub fn write_json_to_string(&self) {
let json = serde_json::to_string(self).unwrap();
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.join("benches/token_type_settings.json");
// Write to file
std::fs::write(path, &json).unwrap();
}
}
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerSettings {
pub white_space: HashMap<char, TokenType>,
pub single_tokens: HashMap<char, TokenType>,
@@ -141,7 +162,7 @@ impl TokenizerSettings {
let var_single_tokens_native: HashSet<char> =
var_single_tokens.iter().map(&to_char).collect();
TokenizerSettings {
let tokenizer_settings = TokenizerSettings {
white_space: white_space_native,
single_tokens: single_tokens_native,
keywords,
@@ -162,15 +183,35 @@ impl TokenizerSettings {
string_escapes_allowed_in_raw_strings,
nested_comments,
hint_start,
};
#[cfg(feature = "profiling")]
{
tokenizer_settings.write_json_to_string();
}
tokenizer_settings
}
}
#[cfg(feature = "profiling")]
impl TokenizerSettings {
pub fn write_json_to_string(&self) {
let json = serde_json::to_string(self).unwrap();
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.join("benches/tokenizer_settings.json");
// Write to file
std::fs::write(path, &json).unwrap();
}
}
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerDialectSettings {
pub unescaped_sequences: HashMap<String, String>,
pub identifiers_can_start_with_digit: bool,
pub numbers_can_be_underscore_separated: bool,
}
#[pymethods]
@@ -179,10 +220,29 @@ impl TokenizerDialectSettings {
pub fn new(
unescaped_sequences: HashMap<String, String>,
identifiers_can_start_with_digit: bool,
numbers_can_be_underscore_separated: bool,
) -> Self {
TokenizerDialectSettings {
let settings = TokenizerDialectSettings {
unescaped_sequences,
identifiers_can_start_with_digit,
numbers_can_be_underscore_separated,
};
#[cfg(feature = "profiling")]
{
settings.write_json_to_string();
}
settings
}
}
#[cfg(feature = "profiling")]
impl TokenizerDialectSettings {
pub fn write_json_to_string(&self) {
let json = serde_json::to_string(self).unwrap();
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.join("benches/tokenizer_dialect_settings.json");
std::fs::write(path, &json).unwrap();
}
}
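Note: the new "profiling" feature above serializes each settings struct to JSON under benches/ via serde. Below is a minimal sketch of how a benchmark could load those dumps back; it is hypothetical and not part of this diff, and it assumes the crate is also usable as a Rust library and that serde_json is available to the bench target.

// Hypothetical helper (not part of this diff): reload the settings JSON
// written by write_json_to_string() when the "profiling" feature is enabled.
// Relies on the Deserialize impls added by the cfg_attr attributes above.
use sqlglotrs::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};

fn load_profiled_settings() -> (TokenizerSettings, TokenTypeSettings, TokenizerDialectSettings) {
    let dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("benches");
    let read = |name: &str| {
        std::fs::read_to_string(dir.join(name))
            .expect("run the tokenizer once with the profiling feature first")
    };
    (
        serde_json::from_str(&read("tokenizer_settings.json")).expect("invalid JSON"),
        serde_json::from_str(&read("token_type_settings.json")).expect("invalid JSON"),
        serde_json::from_str(&read("tokenizer_dialect_settings.json")).expect("invalid JSON"),
    )
}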

sqlglotrs/src/token.rs Normal file

@@ -0,0 +1,61 @@
use crate::settings::TokenType;
use pyo3::prelude::PyListMethods;
use pyo3::types::{PyList, PyNone, PyString};
use pyo3::{pyclass, IntoPy, Py, PyObject, Python};
#[derive(Debug)]
#[pyclass]
pub struct Token {
#[pyo3(get, name = "token_type_index")]
pub token_type: TokenType,
#[pyo3(get, set, name = "token_type")]
pub token_type_py: PyObject,
#[pyo3(get)]
pub text: Py<PyString>,
#[pyo3(get)]
pub line: usize,
#[pyo3(get)]
pub col: usize,
#[pyo3(get)]
pub start: usize,
#[pyo3(get)]
pub end: usize,
#[pyo3(get)]
pub comments: Py<PyList>,
}
impl Token {
pub fn new(
token_type: TokenType,
text: String,
line: usize,
col: usize,
start: usize,
end: usize,
comments: Vec<String>,
) -> Token {
Python::with_gil(|py| Token {
token_type,
token_type_py: PyNone::get_bound(py).into_py(py),
text: PyString::new_bound(py, &text).into_py(py),
line,
col,
start,
end,
comments: PyList::new_bound(py, &comments).into(),
})
}
pub fn append_comments(&self, comments: &mut Vec<String>) {
Python::with_gil(|py| {
let pylist = self.comments.bind(py);
for comment in comments.iter() {
if let Err(_) = pylist.append(comment) {
panic!("Failed to append comments to the Python list");
}
}
});
// Simulate `Vec::append`.
let _ = std::mem::replace(comments, Vec::new());
}
}
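A minimal test-style sketch (hypothetical, not part of this change) showing how Token::new and append_comments could be exercised from Rust; it assumes an embedded interpreter initialized with pyo3::prepare_freethreaded_python() so Python::with_gil works outside a Python process.

#[cfg(test)]
mod tests {
    use crate::token::Token;

    #[test]
    fn append_comments_drains_the_source_vec() {
        // Initialize an embedded Python interpreter so Python::with_gil works in a plain Rust test.
        pyo3::prepare_freethreaded_python();
        // TokenType is a plain u16 index; 0 is used purely for illustration.
        let token = Token::new(0, "SELECT".to_string(), 1, 6, 0, 5, vec![]);
        let mut pending = vec!["-- leading comment".to_string()];
        token.append_comments(&mut pending);
        // append_comments copies into the PyList and drains the source Vec.
        assert!(pending.is_empty());
    }
}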


@@ -1,5 +1,6 @@
use crate::settings::TokenType;
use crate::trie::{Trie, TrieResult};
use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use pyo3::exceptions::PyException;
use pyo3::prelude::*;
use std::cmp::{max, min};
@@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> {
self.advance(1)?;
// Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
if self.settings.nested_comments
&& !self.is_end
&& self.chars(comment_start_size) == *comment_start
{
self.advance(comment_start_size as isize)?;
comment_count += 1
}
@@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> {
if comment_start == self.settings.hint_start
&& self.tokens.last().is_some()
&& self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) {
&& self
.settings
.tokens_preceding_hint
.contains(&self.tokens.last().unwrap().token_type)
{
self.add(self.token_types.hint, None)?;
}
@@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> {
self.advance(-(tag.len() as isize))?;
self.add(self.token_types.heredoc_string_alternative, None)?;
return Ok(true)
return Ok(true);
}
(None, *token_type, format!("{}{}{}", start, tag, end))
@@ -455,10 +463,11 @@ impl<'a> TokenizerState<'a> {
};
self.advance(start.len() as isize)?;
let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
let text =
self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
if let Some(b) = base {
if u64::from_str_radix(&text, b).is_err() {
if u128::from_str_radix(&text, b).is_err() {
return self.error_result(format!(
"Numeric string contains invalid characters from {}:{}",
self.line, self.start
@@ -531,10 +540,16 @@ impl<'a> TokenizerState<'a> {
)
.map(|x| *x);
let replaced = literal.replace("_", "");
if let Some(unwrapped_token_type) = token_type {
self.add(self.token_types.number, Some(number_text))?;
self.add(self.token_types.dcolon, Some("::".to_string()))?;
self.add(unwrapped_token_type, Some(literal))?;
} else if self.dialect_settings.numbers_can_be_underscore_separated
&& self.is_numeric(&replaced)
{
self.add(self.token_types.number, Some(number_text + &replaced))?;
} else if self.dialect_settings.identifiers_can_start_with_digit {
self.add(self.token_types.var, None)?;
} else {
@@ -673,7 +688,7 @@ impl<'a> TokenizerState<'a> {
if self.is_end {
if !raise_unmatched {
text.push(self.current_char);
return Ok(text)
return Ok(text);
}
return self.error_result(format!(
@@ -699,11 +714,17 @@ impl<'a> TokenizerState<'a> {
}
fn is_identifier(&mut self, s: &str) -> bool {
s.chars().enumerate().all(
|(i, c)|
if i == 0 { self.is_alphabetic_or_underscore(c) }
else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
)
s.chars().enumerate().all(|(i, c)| {
if i == 0 {
self.is_alphabetic_or_underscore(c)
} else {
self.is_alphabetic_or_underscore(c) || c.is_digit(10)
}
})
}
fn is_numeric(&mut self, s: &str) -> bool {
s.chars().all(|c| c.is_digit(10))
}
fn extract_value(&mut self) -> Result<String, TokenizerError> {