// sqlglot/sqlglotrs/src/settings.rs

use pyo3::prelude::*;
use rustc_hash::FxHashMap as HashMap;
use rustc_hash::FxHashSet as HashSet;

/// Token types are plain integers shared with the Python side, which is
/// assumed to pass the numeric values of its `TokenType` enum members.
pub type TokenType = u16;

#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenTypeSettings {
    pub bit_string: TokenType,
    pub break_: TokenType,
    pub dcolon: TokenType,
    pub heredoc_string: TokenType,
    pub raw_string: TokenType,
    pub hex_string: TokenType,
    pub identifier: TokenType,
    pub number: TokenType,
    pub parameter: TokenType,
    pub semicolon: TokenType,
    pub string: TokenType,
    pub var: TokenType,
    pub heredoc_string_alternative: TokenType,
    pub hint: TokenType,
}

#[pymethods]
impl TokenTypeSettings {
    #[new]
    pub fn new(
        bit_string: TokenType,
        break_: TokenType,
        dcolon: TokenType,
        heredoc_string: TokenType,
        raw_string: TokenType,
        hex_string: TokenType,
        identifier: TokenType,
        number: TokenType,
        parameter: TokenType,
        semicolon: TokenType,
        string: TokenType,
        var: TokenType,
        heredoc_string_alternative: TokenType,
        hint: TokenType,
    ) -> Self {
        let token_type_settings = TokenTypeSettings {
            bit_string,
            break_,
            dcolon,
            heredoc_string,
            raw_string,
            hex_string,
            identifier,
            number,
            parameter,
            semicolon,
            string,
            var,
            heredoc_string_alternative,
            hint,
        };

        #[cfg(feature = "profiling")]
        {
            token_type_settings.write_json_to_string();
        }

        token_type_settings
    }
}
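
// A note on the assumed flow: the Python tokenizer constructs
// `TokenTypeSettings` once through the `#[new]` binding above, and with the
// `profiling` feature enabled the constructor snapshots itself to JSON (see
// `write_json_to_string` below), presumably so benchmarks can reload the
// same configuration without going through a Python interpreter.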

#[cfg(feature = "profiling")]
impl TokenTypeSettings {
    /// Despite the name, this serializes the settings and writes the JSON to
    /// `benches/token_type_settings.json` inside the crate directory.
    pub fn write_json_to_string(&self) {
        let json = serde_json::to_string(self).unwrap();
        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("benches/token_type_settings.json");
        std::fs::write(path, &json).unwrap();
    }
}

#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerSettings {
    pub white_space: HashMap<char, TokenType>,
    pub single_tokens: HashMap<char, TokenType>,
    pub keywords: HashMap<String, TokenType>,
    pub numeric_literals: HashMap<String, String>,
    pub identifiers: HashMap<char, char>,
    pub identifier_escapes: HashSet<char>,
    pub string_escapes: HashSet<char>,
    pub quotes: HashMap<String, String>,
    pub format_strings: HashMap<String, (String, TokenType)>,
    pub has_bit_strings: bool,
    pub has_hex_strings: bool,
    pub comments: HashMap<String, Option<String>>,
    pub var_single_tokens: HashSet<char>,
    pub commands: HashSet<TokenType>,
    pub command_prefix_tokens: HashSet<TokenType>,
    pub tokens_preceding_hint: HashSet<TokenType>,
    pub heredoc_tag_is_identifier: bool,
    pub string_escapes_allowed_in_raw_strings: bool,
    pub nested_comments: bool,
    pub hint_start: String,
}

#[pymethods]
impl TokenizerSettings {
    #[new]
    pub fn new(
        white_space: HashMap<String, TokenType>,
        single_tokens: HashMap<String, TokenType>,
        keywords: HashMap<String, TokenType>,
        numeric_literals: HashMap<String, String>,
        identifiers: HashMap<String, String>,
        identifier_escapes: HashSet<String>,
        string_escapes: HashSet<String>,
        quotes: HashMap<String, String>,
        format_strings: HashMap<String, (String, TokenType)>,
        has_bit_strings: bool,
        has_hex_strings: bool,
        comments: HashMap<String, Option<String>>,
        var_single_tokens: HashSet<String>,
        commands: HashSet<TokenType>,
        command_prefix_tokens: HashSet<TokenType>,
        tokens_preceding_hint: HashSet<TokenType>,
        heredoc_tag_is_identifier: bool,
        string_escapes_allowed_in_raw_strings: bool,
        nested_comments: bool,
        hint_start: String,
    ) -> Self {
        // Python hands over single-character strings; convert them to `char`
        // up front. Note the byte-length check: only one-byte (ASCII) strings
        // are accepted here, and anything else aborts with a panic.
        let to_char = |v: &String| {
            if v.len() == 1 {
                v.chars().next().unwrap()
            } else {
                panic!("Invalid char: {}", v)
            }
        };

        let white_space_native: HashMap<char, TokenType> = white_space
            .into_iter()
            .map(|(k, v)| (to_char(&k), v))
            .collect();
        let single_tokens_native: HashMap<char, TokenType> = single_tokens
            .into_iter()
            .map(|(k, v)| (to_char(&k), v))
            .collect();
        let identifiers_native: HashMap<char, char> = identifiers
            .iter()
            .map(|(k, v)| (to_char(k), to_char(v)))
            .collect();
        let identifier_escapes_native: HashSet<char> =
            identifier_escapes.iter().map(&to_char).collect();
        let string_escapes_native: HashSet<char> = string_escapes.iter().map(&to_char).collect();
        let var_single_tokens_native: HashSet<char> =
            var_single_tokens.iter().map(&to_char).collect();

        let tokenizer_settings = TokenizerSettings {
            white_space: white_space_native,
            single_tokens: single_tokens_native,
            keywords,
            numeric_literals,
            identifiers: identifiers_native,
            identifier_escapes: identifier_escapes_native,
            string_escapes: string_escapes_native,
            quotes,
            format_strings,
            has_bit_strings,
            has_hex_strings,
            comments,
            var_single_tokens: var_single_tokens_native,
            commands,
            command_prefix_tokens,
            tokens_preceding_hint,
            heredoc_tag_is_identifier,
            string_escapes_allowed_in_raw_strings,
            nested_comments,
            hint_start,
        };

        #[cfg(feature = "profiling")]
        {
            tokenizer_settings.write_json_to_string();
        }

        tokenizer_settings
    }
}
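
// A note on the failure mode: because `to_char` panics on any key that is
// not exactly one byte, a bad entry coming from Python unwinds through PyO3,
// which surfaces it as PyO3's `PanicException` rather than a regular
// `ValueError` (an observation about PyO3's panic handling, not something
// this file handles explicitly).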

#[cfg(feature = "profiling")]
impl TokenizerSettings {
    /// Same snapshot mechanism as `TokenTypeSettings`: serializes the
    /// settings and writes the JSON to `benches/tokenizer_settings.json`.
    pub fn write_json_to_string(&self) {
        let json = serde_json::to_string(self).unwrap();
        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("benches/tokenizer_settings.json");
        std::fs::write(path, &json).unwrap();
    }
}

#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerDialectSettings {
    pub unescaped_sequences: HashMap<String, String>,
    pub identifiers_can_start_with_digit: bool,
    pub numbers_can_be_underscore_separated: bool,
}

#[pymethods]
impl TokenizerDialectSettings {
    #[new]
    pub fn new(
        unescaped_sequences: HashMap<String, String>,
        identifiers_can_start_with_digit: bool,
        numbers_can_be_underscore_separated: bool,
    ) -> Self {
        let settings = TokenizerDialectSettings {
            unescaped_sequences,
            identifiers_can_start_with_digit,
            numbers_can_be_underscore_separated,
        };

        #[cfg(feature = "profiling")]
        {
            settings.write_json_to_string();
        }

        settings
    }
}

#[cfg(feature = "profiling")]
impl TokenizerDialectSettings {
    /// Serializes the settings and writes the JSON to
    /// `benches/tokenizer_dialect_settings.json`.
    pub fn write_json_to_string(&self) {
        let json = serde_json::to_string(self).unwrap();
        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("benches/tokenizer_dialect_settings.json");
        std::fs::write(path, &json).unwrap();
    }
}
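
// A minimal smoke test sketching direct construction from Rust. The token
// codes and the empty collections are placeholders, not the values the
// Python side actually supplies; the goal is only to show the expected
// shapes of the three settings structs.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn settings_construct_with_placeholder_values() {
        let token_types =
            TokenTypeSettings::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
        assert_eq!(token_types.hint, 13);

        let tokenizer = TokenizerSettings::new(
            HashMap::default(), // white_space
            HashMap::default(), // single_tokens
            HashMap::default(), // keywords
            HashMap::default(), // numeric_literals
            HashMap::default(), // identifiers
            HashSet::default(), // identifier_escapes
            HashSet::default(), // string_escapes
            HashMap::default(), // quotes
            HashMap::default(), // format_strings
            false,              // has_bit_strings
            false,              // has_hex_strings
            HashMap::default(), // comments
            HashSet::default(), // var_single_tokens
            HashSet::default(), // commands
            HashSet::default(), // command_prefix_tokens
            HashSet::default(), // tokens_preceding_hint
            false,              // heredoc_tag_is_identifier
            false,              // string_escapes_allowed_in_raw_strings
            true,               // nested_comments
            "/*+".to_string(),  // hint_start
        );
        assert!(tokenizer.keywords.is_empty());

        let dialect = TokenizerDialectSettings::new(HashMap::default(), false, false);
        assert!(!dialect.identifiers_can_start_with_digit);
    }
}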