Adding upstream version 26.1.3.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent 09521056ff
commit d908bee480

119 changed files with 71635 additions and 68059 deletions

sqlglotrs/src/lib.rs
@@ -1,90 +1,13 @@
 use pyo3::prelude::*;
-use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pymodule, types::PyModule, Bound, PyResult};
+use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use token::Token;
+use tokenizer::Tokenizer;
 
-mod settings;
-mod tokenizer;
-mod trie;
-
-pub use self::settings::{
-    TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings,
-};
-pub use self::tokenizer::Tokenizer;
-
-#[derive(Debug)]
-#[pyclass]
-pub struct Token {
-    #[pyo3(get, name = "token_type_index")]
-    pub token_type: TokenType,
-    #[pyo3(get, set, name = "token_type")]
-    pub token_type_py: PyObject,
-    #[pyo3(get)]
-    pub text: Py<PyString>,
-    #[pyo3(get)]
-    pub line: usize,
-    #[pyo3(get)]
-    pub col: usize,
-    #[pyo3(get)]
-    pub start: usize,
-    #[pyo3(get)]
-    pub end: usize,
-    #[pyo3(get)]
-    pub comments: Py<PyList>,
-}
-
-impl Token {
-    pub fn new(
-        token_type: TokenType,
-        text: String,
-        line: usize,
-        col: usize,
-        start: usize,
-        end: usize,
-        comments: Vec<String>,
-    ) -> Token {
-        Python::with_gil(|py| Token {
-            token_type,
-            token_type_py: PyNone::get_bound(py).into_py(py),
-            text: PyString::new_bound(py, &text).into_py(py),
-            line,
-            col,
-            start,
-            end,
-            comments: PyList::new_bound(py, &comments).into(),
-        })
-    }
-
-    pub fn append_comments(&self, comments: &mut Vec<String>) {
-        Python::with_gil(|py| {
-            let pylist = self.comments.bind(py);
-            for comment in comments.iter() {
-                if let Err(_) = pylist.append(comment) {
-                    panic!("Failed to append comments to the Python list");
-                }
-            }
-        });
-        // Simulate `Vec::append`.
-        let _ = std::mem::replace(comments, Vec::new());
-    }
-}
-
-#[pymethods]
-impl Token {
-    #[pyo3(name = "__repr__")]
-    fn python_repr(&self) -> PyResult<String> {
-        Python::with_gil(|py| {
-            Ok(format!(
-                "<Token token_type: {}, text: {}, line: {}, col: {}, start: {}, end: {}, comments: {}>",
-                self.token_type_py.bind(py).repr()?,
-                self.text.bind(py).repr()?,
-                self.line,
-                self.col,
-                self.start,
-                self.end,
-                self.comments.bind(py).repr()?,
-            ))
-        })
-    }
-}
+pub mod settings;
+pub mod token;
+pub mod tokenizer;
+pub mod trie;
 
 #[pymodule]
 fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
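
The body of fn sqlglotrs is cut off in this view. For orientation only, here is a minimal sketch of what a pyo3 module body registering these #[pyclass] types typically looks like; this is an assumption, not the commit's actual code:

#[pymodule]
fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
    // Register each #[pyclass] so Python can import it from the compiled
    // extension module (sketch; the real body is not shown in this diff).
    m.add_class::<Token>()?;
    m.add_class::<TokenTypeSettings>()?;
    m.add_class::<TokenizerSettings>()?;
    m.add_class::<TokenizerDialectSettings>()?;
    m.add_class::<Tokenizer>()?;
    Ok(())
}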

sqlglotrs/src/settings.rs
@@ -1,10 +1,12 @@
-use pyo3::prelude::*;
 use std::collections::{HashMap, HashSet};
 
+use pyo3::prelude::*;
+
 pub type TokenType = u16;
 
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenTypeSettings {
     pub bit_string: TokenType,
     pub break_: TokenType,
@@ -41,7 +43,7 @@ impl TokenTypeSettings {
         heredoc_string_alternative: TokenType,
         hint: TokenType,
     ) -> Self {
-        TokenTypeSettings {
+        let token_type_settings = TokenTypeSettings {
             bit_string,
             break_,
             dcolon,
@@ -56,12 +58,31 @@ impl TokenTypeSettings {
             var,
             heredoc_string_alternative,
             hint,
-        }
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            token_type_settings.write_json_to_string();
+        }
+
+        token_type_settings
     }
 }
 
+#[cfg(feature = "profiling")]
+impl TokenTypeSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/token_type_settings.json");
+        // Write to file
+        std::fs::write(path, &json).unwrap();
+    }
+}
+
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenizerSettings {
     pub white_space: HashMap<char, TokenType>,
     pub single_tokens: HashMap<char, TokenType>,
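
With the profiling feature enabled, each settings constructor now dumps its configuration to benches/*.json as a side effect, which lets benchmarks replay real configurations without a Python round trip. A minimal sketch of the consuming side, assuming the same serde derives; the loader name is hypothetical and not part of this commit:

#[cfg(feature = "profiling")]
fn load_token_type_settings() -> TokenTypeSettings {
    // Read back the JSON that TokenTypeSettings::new wrote out; the path
    // mirrors write_json_to_string above.
    let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("benches/token_type_settings.json");
    let json = std::fs::read_to_string(path)
        .expect("run the tokenizer once with --features profiling first");
    serde_json::from_str(&json).expect("settings JSON should round-trip")
}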
@@ -141,7 +162,7 @@ impl TokenizerSettings {
         let var_single_tokens_native: HashSet<char> =
             var_single_tokens.iter().map(&to_char).collect();
 
-        TokenizerSettings {
+        let tokenizer_settings = TokenizerSettings {
             white_space: white_space_native,
             single_tokens: single_tokens_native,
             keywords,
@@ -162,15 +183,35 @@ impl TokenizerSettings {
             string_escapes_allowed_in_raw_strings,
             nested_comments,
             hint_start,
-        }
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            tokenizer_settings.write_json_to_string();
+        }
+
+        tokenizer_settings
     }
 }
 
+#[cfg(feature = "profiling")]
+impl TokenizerSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/tokenizer_settings.json");
+        // Write to file
+        std::fs::write(path, &json).unwrap();
+    }
+}
+
 #[derive(Clone, Debug)]
 #[pyclass]
+#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
 pub struct TokenizerDialectSettings {
     pub unescaped_sequences: HashMap<String, String>,
     pub identifiers_can_start_with_digit: bool,
     pub numbers_can_be_underscore_separated: bool,
 }
 
 #[pymethods]
@@ -179,10 +220,29 @@ impl TokenizerDialectSettings {
     pub fn new(
         unescaped_sequences: HashMap<String, String>,
         identifiers_can_start_with_digit: bool,
         numbers_can_be_underscore_separated: bool,
     ) -> Self {
-        TokenizerDialectSettings {
+        let settings = TokenizerDialectSettings {
             unescaped_sequences,
             identifiers_can_start_with_digit,
             numbers_can_be_underscore_separated,
-        }
+        };
+
+        #[cfg(feature = "profiling")]
+        {
+            settings.write_json_to_string();
+        }
+
+        settings
     }
 }
+
+#[cfg(feature = "profiling")]
+impl TokenizerDialectSettings {
+    pub fn write_json_to_string(&self) {
+        let json = serde_json::to_string(self).unwrap();
+        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("benches/tokenizer_dialect_settings.json");
+        std::fs::write(path, &json).unwrap();
+    }
+}

sqlglotrs/src/token.rs (new file, 61 lines)
@@ -0,0 +1,61 @@
+use crate::settings::TokenType;
+use pyo3::prelude::PyListMethods;
+use pyo3::types::{PyList, PyNone, PyString};
+use pyo3::{pyclass, IntoPy, Py, PyObject, Python};
+
+#[derive(Debug)]
+#[pyclass]
+pub struct Token {
+    #[pyo3(get, name = "token_type_index")]
+    pub token_type: TokenType,
+    #[pyo3(get, set, name = "token_type")]
+    pub token_type_py: PyObject,
+    #[pyo3(get)]
+    pub text: Py<PyString>,
+    #[pyo3(get)]
+    pub line: usize,
+    #[pyo3(get)]
+    pub col: usize,
+    #[pyo3(get)]
+    pub start: usize,
+    #[pyo3(get)]
+    pub end: usize,
+    #[pyo3(get)]
+    pub comments: Py<PyList>,
+}
+
+impl Token {
+    pub fn new(
+        token_type: TokenType,
+        text: String,
+        line: usize,
+        col: usize,
+        start: usize,
+        end: usize,
+        comments: Vec<String>,
+    ) -> Token {
+        Python::with_gil(|py| Token {
+            token_type,
+            token_type_py: PyNone::get_bound(py).into_py(py),
+            text: PyString::new_bound(py, &text).into_py(py),
+            line,
+            col,
+            start,
+            end,
+            comments: PyList::new_bound(py, &comments).into(),
+        })
+    }
+
+    pub fn append_comments(&self, comments: &mut Vec<String>) {
+        Python::with_gil(|py| {
+            let pylist = self.comments.bind(py);
+            for comment in comments.iter() {
+                if let Err(_) = pylist.append(comment) {
+                    panic!("Failed to append comments to the Python list");
+                }
+            }
+        });
+        // Simulate `Vec::append`.
+        let _ = std::mem::replace(comments, Vec::new());
+    }
+}
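
The extracted module keeps Token's Python-facing fields as GIL-managed handles (Py<PyString>, Py<PyList>), so append_comments pushes into the live Python list and then drains the Rust-side Vec. A minimal crate-internal usage sketch, assuming an initialized interpreter; the token_type value 7 and the texts are illustrative:

fn main() {
    // Initialize the embedded Python interpreter for this sketch.
    pyo3::prepare_freethreaded_python();
    let token = Token::new(7, "SELECT".to_string(), 1, 6, 0, 5, Vec::new());
    let mut pending = vec!["/* leading comment */".to_string()];
    token.append_comments(&mut pending);
    assert!(pending.is_empty()); // drained, mirroring Vec::append semantics
}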

sqlglotrs/src/tokenizer.rs
@@ -1,5 +1,6 @@
+use crate::settings::TokenType;
 use crate::trie::{Trie, TrieResult};
-use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
+use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
 use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 use std::cmp::{max, min};
@@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> {
                 self.advance(1)?;
 
                 // Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
-                if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
+                if self.settings.nested_comments
+                    && !self.is_end
+                    && self.chars(comment_start_size) == *comment_start
+                {
                     self.advance(comment_start_size as isize)?;
                     comment_count += 1
                 }
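
The reflowed condition is part of the depth-tracking scan for dialects that allow nested block comments (databricks, duckdb, postgres, per the comment above). A standalone sketch of the same counting idea, simplified to plain string input and not taken from this crate:

// Returns the byte offset just past the matching `*/`, tracking nesting
// depth the way comment_count does in the tokenizer (simplified sketch).
fn nested_comment_end(s: &str) -> Option<usize> {
    let b = s.as_bytes();
    let mut depth = 1; // caller has already consumed the opening `/*`
    let mut i = 0;
    while i + 1 < b.len() {
        match &b[i..i + 2] {
            b"/*" => { depth += 1; i += 2; }
            b"*/" => { depth -= 1; i += 2; if depth == 0 { return Some(i); } }
            _ => i += 1,
        }
    }
    None // unterminated comment
}

fn main() {
    assert_eq!(nested_comment_end("a /* b */ c */ d"), Some(14));
}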
@@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> {
 
         if comment_start == self.settings.hint_start
             && self.tokens.last().is_some()
-            && self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) {
+            && self
+                .settings
+                .tokens_preceding_hint
+                .contains(&self.tokens.last().unwrap().token_type)
+        {
             self.add(self.token_types.hint, None)?;
         }
 
@@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> {
 
                 self.advance(-(tag.len() as isize))?;
                 self.add(self.token_types.heredoc_string_alternative, None)?;
-                return Ok(true)
+                return Ok(true);
             }
 
             (None, *token_type, format!("{}{}{}", start, tag, end))
@@ -455,10 +463,11 @@ impl<'a> TokenizerState<'a> {
         };
 
         self.advance(start.len() as isize)?;
-        let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
+        let text =
+            self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
 
         if let Some(b) = base {
-            if u64::from_str_radix(&text, b).is_err() {
+            if u128::from_str_radix(&text, b).is_err() {
                 return self.error_result(format!(
                     "Numeric string contains invalid characters from {}:{}",
                     self.line, self.start
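
The u64 to u128 widening matters because the parse is used only to validate characters: a long but perfectly valid binary or hex string can overflow u64 and be misreported as containing invalid characters. For example:

fn main() {
    // A 70-digit binary literal: every character is valid for base 2, but
    // the value no longer fits in 64 bits, so the old check rejected it.
    let text = "1".repeat(70);
    assert!(u64::from_str_radix(&text, 2).is_err()); // PosOverflow, not bad digits
    assert!(u128::from_str_radix(&text, 2).is_ok()); // fits comfortably in 128 bits
}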
@@ -531,10 +540,16 @@ impl<'a> TokenizerState<'a> {
             )
             .map(|x| *x);
 
+        let replaced = literal.replace("_", "");
+
         if let Some(unwrapped_token_type) = token_type {
             self.add(self.token_types.number, Some(number_text))?;
             self.add(self.token_types.dcolon, Some("::".to_string()))?;
             self.add(unwrapped_token_type, Some(literal))?;
+        } else if self.dialect_settings.numbers_can_be_underscore_separated
+            && self.is_numeric(&replaced)
+        {
+            self.add(self.token_types.number, Some(number_text + &replaced))?;
         } else if self.dialect_settings.identifiers_can_start_with_digit {
             self.add(self.token_types.var, None)?;
         } else {
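
The new branch makes underscore-separated literals such as 1_000 scan as a single NUMBER when the dialect flag is set: underscores are stripped from the trailing part, and if what remains is all digits it is glued back onto the leading digits. A behavior sketch; the assumed split of "1_000" into number_text and literal is illustrative, not taken from the tokenizer:

fn main() {
    // number_text holds the digits scanned before the underscore;
    // literal holds the rest (assumed shape "_000" for input "1_000").
    let number_text = "1".to_string();
    let replaced = "_000".replace("_", "");
    assert_eq!(number_text + &replaced, "1000"); // emitted as one NUMBER token
}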
@@ -673,7 +688,7 @@ impl<'a> TokenizerState<'a> {
         if self.is_end {
             if !raise_unmatched {
                 text.push(self.current_char);
-                return Ok(text)
+                return Ok(text);
             }
 
             return self.error_result(format!(
@@ -699,11 +714,17 @@ impl<'a> TokenizerState<'a> {
     }
 
     fn is_identifier(&mut self, s: &str) -> bool {
-        s.chars().enumerate().all(
-            |(i, c)|
-            if i == 0 { self.is_alphabetic_or_underscore(c) }
-            else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
-        )
+        s.chars().enumerate().all(|(i, c)| {
+            if i == 0 {
+                self.is_alphabetic_or_underscore(c)
+            } else {
+                self.is_alphabetic_or_underscore(c) || c.is_digit(10)
+            }
+        })
     }
 
+    fn is_numeric(&mut self, s: &str) -> bool {
+        s.chars().all(|c| c.is_digit(10))
+    }
+
     fn extract_value(&mut self) -> Result<String, TokenizerError> {
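
is_numeric is introduced here to back the underscore-separated number branch above, and is_identifier keeps its meaning through the reflow: the first character must be alphabetic or an underscore, later characters may also be digits. A freestanding sketch of the pair, assuming is_alphabetic_or_underscore means char::is_alphabetic or '_' (the real methods live on TokenizerState and may differ in detail):

// Standalone equivalents of the two predicates, for illustration only.
fn is_alphabetic_or_underscore(c: char) -> bool {
    c.is_alphabetic() || c == '_'
}

fn is_identifier(s: &str) -> bool {
    s.chars().enumerate().all(|(i, c)| {
        if i == 0 {
            is_alphabetic_or_underscore(c)
        } else {
            is_alphabetic_or_underscore(c) || c.is_ascii_digit()
        }
    })
}

fn main() {
    assert!(is_identifier("_tmp1"));
    assert!(!is_identifier("1tmp"));
}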