
Adding upstream version 26.1.3.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-13 21:59:41 +01:00
parent 09521056ff
commit d908bee480
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
119 changed files with 71635 additions and 68059 deletions


@@ -1,90 +1,13 @@
use pyo3::prelude::*;
use pyo3::types::{PyList, PyNone, PyString};
use pyo3::{pymodule, types::PyModule, Bound, PyResult};
use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use token::Token;
use tokenizer::Tokenizer;
mod settings;
mod tokenizer;
mod trie;
pub use self::settings::{
TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings,
};
pub use self::tokenizer::Tokenizer;
#[derive(Debug)]
#[pyclass]
pub struct Token {
#[pyo3(get, name = "token_type_index")]
pub token_type: TokenType,
#[pyo3(get, set, name = "token_type")]
pub token_type_py: PyObject,
#[pyo3(get)]
pub text: Py<PyString>,
#[pyo3(get)]
pub line: usize,
#[pyo3(get)]
pub col: usize,
#[pyo3(get)]
pub start: usize,
#[pyo3(get)]
pub end: usize,
#[pyo3(get)]
pub comments: Py<PyList>,
}
impl Token {
pub fn new(
token_type: TokenType,
text: String,
line: usize,
col: usize,
start: usize,
end: usize,
comments: Vec<String>,
) -> Token {
Python::with_gil(|py| Token {
token_type,
token_type_py: PyNone::get_bound(py).into_py(py),
text: PyString::new_bound(py, &text).into_py(py),
line,
col,
start,
end,
comments: PyList::new_bound(py, &comments).into(),
})
}
pub fn append_comments(&self, comments: &mut Vec<String>) {
Python::with_gil(|py| {
let pylist = self.comments.bind(py);
for comment in comments.iter() {
if let Err(_) = pylist.append(comment) {
panic!("Failed to append comments to the Python list");
}
}
});
// Simulate `Vec::append`.
let _ = std::mem::replace(comments, Vec::new());
}
}
#[pymethods]
impl Token {
#[pyo3(name = "__repr__")]
fn python_repr(&self) -> PyResult<String> {
Python::with_gil(|py| {
Ok(format!(
"<Token token_type: {}, text: {}, line: {}, col: {}, start: {}, end: {}, comments: {}>",
self.token_type_py.bind(py).repr()?,
self.text.bind(py).repr()?,
self.line,
self.col,
self.start,
self.end,
self.comments.bind(py).repr()?,
))
})
}
}
pub mod settings;
pub mod token;
pub mod tokenizer;
pub mod trie;
#[pymodule]
fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {


@@ -1,10 +1,12 @@
use pyo3::prelude::*;
use std::collections::{HashMap, HashSet};
use pyo3::prelude::*;
pub type TokenType = u16;
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenTypeSettings {
pub bit_string: TokenType,
pub break_: TokenType,
@@ -41,7 +43,7 @@ impl TokenTypeSettings {
heredoc_string_alternative: TokenType,
hint: TokenType,
) -> Self {
TokenTypeSettings {
let token_type_settings = TokenTypeSettings {
bit_string,
break_,
dcolon,
@@ -56,12 +58,31 @@ impl TokenTypeSettings {
var,
heredoc_string_alternative,
hint,
};
#[cfg(feature = "profiling")]
{
token_type_settings.write_json_to_string();
}
token_type_settings
}
}
#[cfg(feature = "profiling")]
impl TokenTypeSettings {
pub fn write_json_to_string(&self) {
let json = serde_json::to_string(self).unwrap();
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.join("benches/token_type_settings.json");
// Write to file
std::fs::write(path, &json).unwrap();
}
}
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerSettings {
pub white_space: HashMap<char, TokenType>,
pub single_tokens: HashMap<char, TokenType>,
@@ -141,7 +162,7 @@ impl TokenizerSettings {
let var_single_tokens_native: HashSet<char> =
var_single_tokens.iter().map(&to_char).collect();
TokenizerSettings {
let tokenizer_settings = TokenizerSettings {
white_space: white_space_native,
single_tokens: single_tokens_native,
keywords,
@@ -162,15 +183,35 @@ impl TokenizerSettings {
string_escapes_allowed_in_raw_strings,
nested_comments,
hint_start,
};
#[cfg(feature = "profiling")]
{
tokenizer_settings.write_json_to_string();
}
tokenizer_settings
}
}
#[cfg(feature = "profiling")]
impl TokenizerSettings {
pub fn write_json_to_string(&self) {
let json = serde_json::to_string(self).unwrap();
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.join("benches/tokenizer_settings.json");
// Write to file
std::fs::write(path, &json).unwrap();
}
}
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerDialectSettings {
pub unescaped_sequences: HashMap<String, String>,
pub identifiers_can_start_with_digit: bool,
pub numbers_can_be_underscore_separated: bool,
}
#[pymethods]
@@ -179,10 +220,29 @@ impl TokenizerDialectSettings {
pub fn new(
unescaped_sequences: HashMap<String, String>,
identifiers_can_start_with_digit: bool,
numbers_can_be_underscore_separated: bool,
) -> Self {
TokenizerDialectSettings {
let settings = TokenizerDialectSettings {
unescaped_sequences,
identifiers_can_start_with_digit,
numbers_can_be_underscore_separated,
};
#[cfg(feature = "profiling")]
{
settings.write_json_to_string();
}
settings
}
}
#[cfg(feature = "profiling")]
impl TokenizerDialectSettings {
pub fn write_json_to_string(&self) {
let json = serde_json::to_string(self).unwrap();
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.join("benches/tokenizer_dialect_settings.json");
std::fs::write(path, &json).unwrap();
}
}
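Note: the new "profiling" feature above serializes each settings struct to JSON under benches/ via serde. Below is a minimal sketch of how a benchmark could load those dumps back; it is hypothetical and not part of this diff, and it assumes the crate is also usable as a Rust library and that serde_json is available to the bench target.

// Hypothetical helper (not part of this diff): reload the settings JSON
// written by write_json_to_string() when the "profiling" feature is enabled.
// Relies on the Deserialize impls added by the cfg_attr attributes above.
use sqlglotrs::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};

fn load_profiled_settings() -> (TokenizerSettings, TokenTypeSettings, TokenizerDialectSettings) {
    let dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("benches");
    let read = |name: &str| {
        std::fs::read_to_string(dir.join(name))
            .expect("run the tokenizer once with the profiling feature first")
    };
    (
        serde_json::from_str(&read("tokenizer_settings.json")).expect("invalid JSON"),
        serde_json::from_str(&read("token_type_settings.json")).expect("invalid JSON"),
        serde_json::from_str(&read("tokenizer_dialect_settings.json")).expect("invalid JSON"),
    )
}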

sqlglotrs/src/token.rs Normal file

@@ -0,0 +1,61 @@
use crate::settings::TokenType;
use pyo3::prelude::PyListMethods;
use pyo3::types::{PyList, PyNone, PyString};
use pyo3::{pyclass, IntoPy, Py, PyObject, Python};
#[derive(Debug)]
#[pyclass]
pub struct Token {
#[pyo3(get, name = "token_type_index")]
pub token_type: TokenType,
#[pyo3(get, set, name = "token_type")]
pub token_type_py: PyObject,
#[pyo3(get)]
pub text: Py<PyString>,
#[pyo3(get)]
pub line: usize,
#[pyo3(get)]
pub col: usize,
#[pyo3(get)]
pub start: usize,
#[pyo3(get)]
pub end: usize,
#[pyo3(get)]
pub comments: Py<PyList>,
}
impl Token {
pub fn new(
token_type: TokenType,
text: String,
line: usize,
col: usize,
start: usize,
end: usize,
comments: Vec<String>,
) -> Token {
Python::with_gil(|py| Token {
token_type,
token_type_py: PyNone::get_bound(py).into_py(py),
text: PyString::new_bound(py, &text).into_py(py),
line,
col,
start,
end,
comments: PyList::new_bound(py, &comments).into(),
})
}
pub fn append_comments(&self, comments: &mut Vec<String>) {
Python::with_gil(|py| {
let pylist = self.comments.bind(py);
for comment in comments.iter() {
if let Err(_) = pylist.append(comment) {
panic!("Failed to append comments to the Python list");
}
}
});
// Simulate `Vec::append`.
let _ = std::mem::replace(comments, Vec::new());
}
}
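A minimal test-style sketch (hypothetical, not part of this change) showing how Token::new and append_comments could be exercised from Rust; it assumes an embedded interpreter initialized with pyo3::prepare_freethreaded_python() so Python::with_gil works outside a Python process.

#[cfg(test)]
mod tests {
    use crate::token::Token;

    #[test]
    fn append_comments_drains_the_source_vec() {
        // Initialize an embedded Python interpreter so Python::with_gil works in a plain Rust test.
        pyo3::prepare_freethreaded_python();
        // TokenType is a plain u16 index; 0 is used purely for illustration.
        let token = Token::new(0, "SELECT".to_string(), 1, 6, 0, 5, vec![]);
        let mut pending = vec!["-- leading comment".to_string()];
        token.append_comments(&mut pending);
        // append_comments copies into the PyList and drains the source Vec.
        assert!(pending.is_empty());
    }
}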


@@ -1,5 +1,6 @@
use crate::settings::TokenType;
use crate::trie::{Trie, TrieResult};
use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use pyo3::exceptions::PyException;
use pyo3::prelude::*;
use std::cmp::{max, min};
@@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> {
self.advance(1)?;
// Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
if self.settings.nested_comments
&& !self.is_end
&& self.chars(comment_start_size) == *comment_start
{
self.advance(comment_start_size as isize)?;
comment_count += 1
}
@@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> {
if comment_start == self.settings.hint_start
&& self.tokens.last().is_some()
&& self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) {
&& self
.settings
.tokens_preceding_hint
.contains(&self.tokens.last().unwrap().token_type)
{
self.add(self.token_types.hint, None)?;
}
@@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> {
self.advance(-(tag.len() as isize))?;
self.add(self.token_types.heredoc_string_alternative, None)?;
return Ok(true)
return Ok(true);
}
(None, *token_type, format!("{}{}{}", start, tag, end))
@@ -455,10 +463,11 @@ impl<'a> TokenizerState<'a> {
};
self.advance(start.len() as isize)?;
let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
let text =
self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
if let Some(b) = base {
if u64::from_str_radix(&text, b).is_err() {
if u128::from_str_radix(&text, b).is_err() {
return self.error_result(format!(
"Numeric string contains invalid characters from {}:{}",
self.line, self.start
@@ -531,10 +540,16 @@ impl<'a> TokenizerState<'a> {
)
.map(|x| *x);
let replaced = literal.replace("_", "");
if let Some(unwrapped_token_type) = token_type {
self.add(self.token_types.number, Some(number_text))?;
self.add(self.token_types.dcolon, Some("::".to_string()))?;
self.add(unwrapped_token_type, Some(literal))?;
} else if self.dialect_settings.numbers_can_be_underscore_separated
&& self.is_numeric(&replaced)
{
self.add(self.token_types.number, Some(number_text + &replaced))?;
} else if self.dialect_settings.identifiers_can_start_with_digit {
self.add(self.token_types.var, None)?;
} else {
@@ -673,7 +688,7 @@ impl<'a> TokenizerState<'a> {
if self.is_end {
if !raise_unmatched {
text.push(self.current_char);
return Ok(text)
return Ok(text);
}
return self.error_result(format!(
@@ -699,11 +714,17 @@ impl<'a> TokenizerState<'a> {
}
fn is_identifier(&mut self, s: &str) -> bool {
s.chars().enumerate().all(
|(i, c)|
if i == 0 { self.is_alphabetic_or_underscore(c) }
else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
)
s.chars().enumerate().all(|(i, c)| {
if i == 0 {
self.is_alphabetic_or_underscore(c)
} else {
self.is_alphabetic_or_underscore(c) || c.is_digit(10)
}
})
}
fn is_numeric(&mut self, s: &str) -> bool {
s.chars().all(|c| c.is_digit(10))
}
fn extract_value(&mut self) -> Result<String, TokenizerError> {