use std::collections::{HashMap, HashSet};

use pyo3::prelude::*;

pub type TokenType = u16;

/// Token-type ids the tokenizer needs to reference directly, as configured
/// from Python.
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenTypeSettings {
    pub bit_string: TokenType,
    pub break_: TokenType,
    pub dcolon: TokenType,
    pub heredoc_string: TokenType,
    pub raw_string: TokenType,
    pub hex_string: TokenType,
    pub identifier: TokenType,
    pub number: TokenType,
    pub parameter: TokenType,
    pub semicolon: TokenType,
    pub string: TokenType,
    pub var: TokenType,
    pub heredoc_string_alternative: TokenType,
    pub hint: TokenType,
}

#[pymethods]
impl TokenTypeSettings {
    #[new]
    pub fn new(
        bit_string: TokenType,
        break_: TokenType,
        dcolon: TokenType,
        heredoc_string: TokenType,
        raw_string: TokenType,
        hex_string: TokenType,
        identifier: TokenType,
        number: TokenType,
        parameter: TokenType,
        semicolon: TokenType,
        string: TokenType,
        var: TokenType,
        heredoc_string_alternative: TokenType,
        hint: TokenType,
    ) -> Self {
        let token_type_settings = TokenTypeSettings {
            bit_string,
            break_,
            dcolon,
            heredoc_string,
            raw_string,
            hex_string,
            identifier,
            number,
            parameter,
            semicolon,
            string,
            var,
            heredoc_string_alternative,
            hint,
        };

        #[cfg(feature = "profiling")]
        {
            token_type_settings.write_json_to_string();
        }

        token_type_settings
    }
}

#[cfg(feature = "profiling")]
impl TokenTypeSettings {
    /// Serializes the settings to JSON and writes them to a file under
    /// `benches/`, so benchmarks can replay the exact settings passed in
    /// from Python.
    pub fn write_json_to_string(&self) {
        let json = serde_json::to_string(self).unwrap();
        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("benches/token_type_settings.json");
        std::fs::write(path, &json).unwrap();
    }
}
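// A minimal construction sketch for `TokenTypeSettings`. The numeric ids
// below are placeholders chosen for illustration only; in practice the
// Python side supplies the real token-type ids, so this is not a canonical
// mapping.
#[cfg(test)]
mod token_type_settings_tests {
    use super::*;

    #[test]
    fn constructs_with_placeholder_ids() {
        let settings = TokenTypeSettings::new(
            0,  // bit_string
            1,  // break_
            2,  // dcolon
            3,  // heredoc_string
            4,  // raw_string
            5,  // hex_string
            6,  // identifier
            7,  // number
            8,  // parameter
            9,  // semicolon
            10, // string
            11, // var
            12, // heredoc_string_alternative
            13, // hint
        );
        assert_eq!(settings.identifier, 6);
        assert_eq!(settings.hint, 13);
    }
}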
/// Tokenizer configuration passed in from Python. Single-character lookup
/// tables arrive as `String`s and are converted to native `char` keys in the
/// constructor.
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerSettings {
    pub white_space: HashMap<char, TokenType>,
    pub single_tokens: HashMap<char, TokenType>,
    pub keywords: HashMap<String, TokenType>,
    pub numeric_literals: HashMap<String, String>,
    pub identifiers: HashMap<char, char>,
    pub identifier_escapes: HashSet<char>,
    pub string_escapes: HashSet<char>,
    pub quotes: HashMap<String, String>,
    pub format_strings: HashMap<String, (String, TokenType)>,
    pub has_bit_strings: bool,
    pub has_hex_strings: bool,
    pub comments: HashMap<String, Option<String>>,
    pub var_single_tokens: HashSet<char>,
    pub commands: HashSet<TokenType>,
    pub command_prefix_tokens: HashSet<TokenType>,
    pub tokens_preceding_hint: HashSet<TokenType>,
    pub heredoc_tag_is_identifier: bool,
    pub string_escapes_allowed_in_raw_strings: bool,
    pub nested_comments: bool,
    pub hint_start: String,
}

#[pymethods]
impl TokenizerSettings {
    #[new]
    pub fn new(
        white_space: HashMap<String, TokenType>,
        single_tokens: HashMap<String, TokenType>,
        keywords: HashMap<String, TokenType>,
        numeric_literals: HashMap<String, String>,
        identifiers: HashMap<String, String>,
        identifier_escapes: HashSet<String>,
        string_escapes: HashSet<String>,
        quotes: HashMap<String, String>,
        format_strings: HashMap<String, (String, TokenType)>,
        has_bit_strings: bool,
        has_hex_strings: bool,
        comments: HashMap<String, Option<String>>,
        var_single_tokens: HashSet<String>,
        commands: HashSet<TokenType>,
        command_prefix_tokens: HashSet<TokenType>,
        tokens_preceding_hint: HashSet<TokenType>,
        heredoc_tag_is_identifier: bool,
        string_escapes_allowed_in_raw_strings: bool,
        nested_comments: bool,
        hint_start: String,
    ) -> Self {
        // Python hands us single-character strings; convert each one to a
        // `char`, counting chars rather than bytes so that single non-ASCII
        // characters are accepted too.
        let to_char = |v: &String| {
            let mut chars = v.chars();
            match (chars.next(), chars.next()) {
                (Some(c), None) => c,
                _ => panic!("Invalid char: {}", v),
            }
        };

        let white_space_native: HashMap<char, TokenType> = white_space
            .into_iter()
            .map(|(k, v)| (to_char(&k), v))
            .collect();

        let single_tokens_native: HashMap<char, TokenType> = single_tokens
            .into_iter()
            .map(|(k, v)| (to_char(&k), v))
            .collect();

        let identifiers_native: HashMap<char, char> = identifiers
            .iter()
            .map(|(k, v)| (to_char(k), to_char(v)))
            .collect();

        let identifier_escapes_native: HashSet<char> =
            identifier_escapes.iter().map(&to_char).collect();
        let string_escapes_native: HashSet<char> =
            string_escapes.iter().map(&to_char).collect();
        let var_single_tokens_native: HashSet<char> =
            var_single_tokens.iter().map(&to_char).collect();

        let tokenizer_settings = TokenizerSettings {
            white_space: white_space_native,
            single_tokens: single_tokens_native,
            keywords,
            numeric_literals,
            identifiers: identifiers_native,
            identifier_escapes: identifier_escapes_native,
            string_escapes: string_escapes_native,
            quotes,
            format_strings,
            has_bit_strings,
            has_hex_strings,
            comments,
            var_single_tokens: var_single_tokens_native,
            commands,
            command_prefix_tokens,
            tokens_preceding_hint,
            heredoc_tag_is_identifier,
            string_escapes_allowed_in_raw_strings,
            nested_comments,
            hint_start,
        };

        #[cfg(feature = "profiling")]
        {
            tokenizer_settings.write_json_to_string();
        }

        tokenizer_settings
    }
}

#[cfg(feature = "profiling")]
impl TokenizerSettings {
    /// Serializes the settings to JSON and writes them to a file under
    /// `benches/`, so benchmarks can replay the exact settings passed in
    /// from Python.
    pub fn write_json_to_string(&self) {
        let json = serde_json::to_string(self).unwrap();
        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("benches/tokenizer_settings.json");
        std::fs::write(path, &json).unwrap();
    }
}

/// Dialect-specific tokenizer behavior, configured from Python.
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerDialectSettings {
    pub unescaped_sequences: HashMap<String, String>,
    pub identifiers_can_start_with_digit: bool,
    pub numbers_can_be_underscore_separated: bool,
}

#[pymethods]
impl TokenizerDialectSettings {
    #[new]
    pub fn new(
        unescaped_sequences: HashMap<String, String>,
        identifiers_can_start_with_digit: bool,
        numbers_can_be_underscore_separated: bool,
    ) -> Self {
        let settings = TokenizerDialectSettings {
            unescaped_sequences,
            identifiers_can_start_with_digit,
            numbers_can_be_underscore_separated,
        };

        #[cfg(feature = "profiling")]
        {
            settings.write_json_to_string();
        }

        settings
    }
}

#[cfg(feature = "profiling")]
impl TokenizerDialectSettings {
    /// Serializes the settings to JSON and writes them to a file under
    /// `benches/`, so benchmarks can replay the exact settings passed in
    /// from Python.
    pub fn write_json_to_string(&self) {
        let json = serde_json::to_string(self).unwrap();
        let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("benches/tokenizer_dialect_settings.json");
        std::fs::write(path, &json).unwrap();
    }
}
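// A minimal sketch of how these settings classes could be exposed to Python,
// assuming this crate is compiled as a pyo3 extension built against pyo3
// 0.21+ (the `Bound` module API). The module name `sqlglotrs_settings` is
// hypothetical; the real crate may register these classes elsewhere.
#[pymodule]
fn sqlglotrs_settings(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<TokenTypeSettings>()?;
    m.add_class::<TokenizerSettings>()?;
    m.add_class::<TokenizerDialectSettings>()?;
    Ok(())
}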