1
0
Fork 0

Adding upstream version 26.1.3.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-13 21:59:41 +01:00
parent 09521056ff
commit d908bee480
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
119 changed files with 71635 additions and 68059 deletions

552
sqlglotrs/Cargo.lock generated
View file

@ -1,6 +1,27 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "autocfg"
@ -8,18 +29,171 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
"serde",
]
[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half",
]
[[package]]
name = "clap"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
dependencies = [
"clap_builder",
]
[[package]]
name = "clap_builder"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
dependencies = [
"anstyle",
"clap_lex",
]
[[package]]
name = "clap_lex"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "criterion"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
dependencies = [
"anes",
"cast",
"ciborium",
"clap",
"criterion-plot",
"is-terminal",
"itertools",
"num-traits",
"once_cell",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
dependencies = [
"cast",
"itertools",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "half"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
dependencies = [
"cfg-if",
"crunchy",
]
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
[[package]]
name = "indoc"
version = "2.0.4"
@ -27,10 +201,58 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8"
[[package]]
name = "libc"
version = "0.2.150"
name = "is-terminal"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b"
dependencies = [
"hermit-abi",
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "itertools"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
[[package]]
name = "js-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "memoffset"
@ -41,12 +263,55 @@ dependencies = [
"autocfg",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "oorandom"
version = "11.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"
[[package]]
name = "plotters"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
[[package]]
name = "plotters-svg"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
dependencies = [
"plotters-backend",
]
[[package]]
name = "portable-atomic"
version = "1.9.0"
@ -135,10 +400,110 @@ dependencies = [
]
[[package]]
name = "sqlglotrs"
version = "0.3.0"
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "ryu"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "serde"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.133"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
[[package]]
name = "sqlglotrs"
version = "0.3.4"
dependencies = [
"criterion",
"pyo3",
"serde",
"serde_json",
"sqlglotrs",
]
[[package]]
@ -158,6 +523,16 @@ version = "0.12.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
@ -169,3 +544,168 @@ name = "unindent"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
dependencies = [
"cfg-if",
"once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
[[package]]
name = "web-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi-util"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View file

@ -1,12 +1,31 @@
[package]
name = "sqlglotrs"
version = "0.3.0"
version = "0.3.4"
edition = "2021"
license = "MIT"
[lib]
name = "sqlglotrs"
crate-type = ["cdylib"]
crate-type = ["cdylib", "rlib"]
[[bench]]
name = "long"
harness = false
[features]
# Enable this feature to use the serde and serde_json crates for profiling purposes
default = []
profiling = ["serde", "serde_json"]
[dependencies]
pyo3 = "0.22.6"
pyo3 = {version ="0.22.6", features = ["auto-initialize"]}
# Optional dependencies used for profiling
serde = { version = "1", features = ["derive"] , optional = true }
serde_json = { version = "1", optional = true }
[dev-dependencies]
criterion = "0.5"
serde = { version = "1", features = ["derive"] }
serde_json = { version = "1" }
sqlglotrs = { path = "." , features = ["profiling"] }

View file

@ -0,0 +1 @@
{"unescaped_sequences":{},"identifiers_can_start_with_digit":false,"numbers_can_be_underscore_separated":false}

74
sqlglotrs/benches/long.rs Normal file
View file

@ -0,0 +1,74 @@
use std::path::Path;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use sqlglotrs::settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use sqlglotrs::tokenizer::Tokenizer;
/// SQL statement used as the input of the `long` benchmark in this file.
///
/// It is a raw string (`r#"..."#`), so the double-quoted identifiers and the
/// doubled single quotes inside it need no Rust-level escaping.
pub const LONG: &str = r#"
SELECT
"e"."employee_id" AS "Employee #",
"e"."first_name" || ' ' || "e"."last_name" AS "Name",
"e"."email" AS "Email",
"e"."phone_number" AS "Phone",
TO_CHAR("e"."hire_date", 'MM/DD/YYYY') AS "Hire Date",
TO_CHAR("e"."salary", 'L99G999D99', 'NLS_NUMERIC_CHARACTERS = ''.,'' NLS_CURRENCY = ''$''') AS "Salary",
"e"."commission_pct" AS "Commission %",
'works as ' || "j"."job_title" || ' in ' || "d"."department_name" || ' department (manager: ' || "dm"."first_name" || ' ' || "dm"."last_name" || ') and immediate supervisor: ' || "m"."first_name" || ' ' || "m"."last_name" AS "Current Job",
TO_CHAR("j"."min_salary", 'L99G999D99', 'NLS_NUMERIC_CHARACTERS = ''.,'' NLS_CURRENCY = ''$''') || ' - ' || TO_CHAR("j"."max_salary", 'L99G999D99', 'NLS_NUMERIC_CHARACTERS = ''.,'' NLS_CURRENCY = ''$''') AS "Current Salary",
"l"."street_address" || ', ' || "l"."postal_code" || ', ' || "l"."city" || ', ' || "l"."state_province" || ', ' || "c"."country_name" || ' (' || "r"."region_name" || ')' AS "Location",
"jh"."job_id" AS "History Job ID",
'worked from ' || TO_CHAR("jh"."start_date", 'MM/DD/YYYY') || ' to ' || TO_CHAR("jh"."end_date", 'MM/DD/YYYY') || ' as ' || "jj"."job_title" || ' in ' || "dd"."department_name" || ' department' AS "History Job Title",
case when 1 then 1 when 2 then 2 when 3 then 3 when 4 then 4 when 5 then 5 else a(b(c + 1 * 3 % 4)) end
FROM "employees" AS e
JOIN "jobs" AS j
ON "e"."job_id" = "j"."job_id"
LEFT JOIN "employees" AS m
ON "e"."manager_id" = "m"."employee_id"
LEFT JOIN "departments" AS d
ON "d"."department_id" = "e"."department_id"
LEFT JOIN "employees" AS dm
ON "d"."manager_id" = "dm"."employee_id"
LEFT JOIN "locations" AS l
ON "d"."location_id" = "l"."location_id"
LEFT JOIN "countries" AS c
ON "l"."country_id" = "c"."country_id"
LEFT JOIN "regions" AS r
ON "c"."region_id" = "r"."region_id"
LEFT JOIN "job_history" AS jh
ON "e"."employee_id" = "jh"."employee_id"
LEFT JOIN "jobs" AS jj
ON "jj"."job_id" = "jh"."job_id"
LEFT JOIN "departments" AS dd
ON "dd"."department_id" = "jh"."department_id"
ORDER BY
"e"."employee_id"
"#;
fn long(c: &mut Criterion) {
// Read tokenizer settings
let path = Path::new(env!("CARGO_MANIFEST_DIR")).join("benches");
let settings_file = std::fs::read_to_string(path.join("tokenizer_settings.json")).unwrap();
let tokenizer_settings = serde_json::from_str::<TokenizerSettings>(&settings_file).unwrap();
let settings_type_file =
std::fs::read_to_string(path.join("token_type_settings.json")).unwrap();
let settings_type_file =
serde_json::from_str::<TokenTypeSettings>(&settings_type_file).unwrap();
let dialect_settings = std::fs::read_to_string(path.join("dialect_settings.json")).unwrap();
let dialect_settings =
serde_json::from_str::<TokenizerDialectSettings>(&dialect_settings).unwrap();
let tokenizer = Tokenizer::new(tokenizer_settings, settings_type_file);
c.bench_function("long", |b| {
b.iter(|| black_box(tokenizer.tokenize(LONG, &dialect_settings)));
});
}
// Declares the benchmark group `benches`: it uses Criterion's default
// configuration and has `long` as its only target.
criterion_group! {
name = benches;
config = Criterion::default();
targets = long
}
// Passes the `benches` group to Criterion's entry-point macro. Cargo.toml sets
// `harness = false` for this bench target, so the default test harness is not used.
criterion_main!(benches);

View file

@ -0,0 +1 @@
{"bit_string":67,"break_":55,"dcolon":11,"heredoc_string":72,"raw_string":71,"hex_string":68,"identifier":58,"number":57,"parameter":47,"semicolon":13,"string":56,"var":66,"heredoc_string_alternative":66,"hint":254}

View file

@ -0,0 +1 @@
{"unescaped_sequences":{},"identifiers_can_start_with_digit":false,"numbers_can_be_underscore_separated":false}

View file

@ -0,0 +1 @@
{"white_space":{"\n":55,"\t":54,"\r":55," ":54},"single_tokens":{"\"":320,",":6,".":7,"[":2,"*":14,":":10,"]":3,"'":320,"(":0,")":1,"?":311,"-":8,"@":47,"$":46},"keywords":{"..":7},"numeric_literals":{},"identifiers":{"\"":"\""},"identifier_escapes":["\\"],"string_escapes":["\\"],"quotes":{"'":"'"},"format_strings":{"N'":["'",70],"n'":["'",70]},"has_bit_strings":false,"has_hex_strings":false,"comments":{"{#":"#}","--":null,"/*":"*/"},"var_single_tokens":[],"commands":[237,341,205,234,324],"command_prefix_tokens":[13,197],"tokens_preceding_hint":[261,334,221,361],"heredoc_tag_is_identifier":false,"string_escapes_allowed_in_raw_strings":true,"nested_comments":true,"hint_start":"/*+"}

View file

@ -1,90 +1,13 @@
use pyo3::prelude::*;
use pyo3::types::{PyList, PyNone, PyString};
use pyo3::{pymodule, types::PyModule, Bound, PyResult};
use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use token::Token;
use tokenizer::Tokenizer;
mod settings;
mod tokenizer;
mod trie;
pub use self::settings::{
TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings,
};
pub use self::tokenizer::Tokenizer;
#[derive(Debug)]
#[pyclass]
pub struct Token {
#[pyo3(get, name = "token_type_index")]
pub token_type: TokenType,
#[pyo3(get, set, name = "token_type")]
pub token_type_py: PyObject,
#[pyo3(get)]
pub text: Py<PyString>,
#[pyo3(get)]
pub line: usize,
#[pyo3(get)]
pub col: usize,
#[pyo3(get)]
pub start: usize,
#[pyo3(get)]
pub end: usize,
#[pyo3(get)]
pub comments: Py<PyList>,
}
impl Token {
pub fn new(
token_type: TokenType,
text: String,
line: usize,
col: usize,
start: usize,
end: usize,
comments: Vec<String>,
) -> Token {
Python::with_gil(|py| Token {
token_type,
token_type_py: PyNone::get_bound(py).into_py(py),
text: PyString::new_bound(py, &text).into_py(py),
line,
col,
start,
end,
comments: PyList::new_bound(py, &comments).into(),
})
}
pub fn append_comments(&self, comments: &mut Vec<String>) {
Python::with_gil(|py| {
let pylist = self.comments.bind(py);
for comment in comments.iter() {
if let Err(_) = pylist.append(comment) {
panic!("Failed to append comments to the Python list");
}
}
});
// Simulate `Vec::append`.
let _ = std::mem::replace(comments, Vec::new());
}
}
#[pymethods]
impl Token {
#[pyo3(name = "__repr__")]
fn python_repr(&self) -> PyResult<String> {
Python::with_gil(|py| {
Ok(format!(
"<Token token_type: {}, text: {}, line: {}, col: {}, start: {}, end: {}, comments: {}>",
self.token_type_py.bind(py).repr()?,
self.text.bind(py).repr()?,
self.line,
self.col,
self.start,
self.end,
self.comments.bind(py).repr()?,
))
})
}
}
pub mod settings;
pub mod token;
pub mod tokenizer;
pub mod trie;
#[pymodule]
fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> {

View file

@ -1,10 +1,12 @@
use pyo3::prelude::*;
use std::collections::{HashMap, HashSet};
use pyo3::prelude::*;
pub type TokenType = u16;
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenTypeSettings {
pub bit_string: TokenType,
pub break_: TokenType,
@ -41,7 +43,7 @@ impl TokenTypeSettings {
heredoc_string_alternative: TokenType,
hint: TokenType,
) -> Self {
TokenTypeSettings {
let token_type_settings = TokenTypeSettings {
bit_string,
break_,
dcolon,
@ -56,12 +58,31 @@ impl TokenTypeSettings {
var,
heredoc_string_alternative,
hint,
};
#[cfg(feature = "profiling")]
{
token_type_settings.write_json_to_string();
}
token_type_settings
}
}
#[cfg(feature = "profiling")]
impl TokenTypeSettings {
    /// Serializes these settings to JSON and stores the result in
    /// `benches/token_type_settings.json` under the crate's manifest directory.
    ///
    /// Despite its name, this method returns nothing: the JSON text is written
    /// to that file, replacing any previous contents.
    ///
    /// # Panics
    /// Panics if serialization fails or the file cannot be written.
    pub fn write_json_to_string(&self) {
        let serialized = serde_json::to_string(self).unwrap();
        // `CARGO_MANIFEST_DIR` is resolved at compile time by `env!`.
        let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
        std::fs::write(
            manifest_dir.join("benches/token_type_settings.json"),
            &serialized,
        )
        .unwrap();
    }
}
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerSettings {
pub white_space: HashMap<char, TokenType>,
pub single_tokens: HashMap<char, TokenType>,
@ -141,7 +162,7 @@ impl TokenizerSettings {
let var_single_tokens_native: HashSet<char> =
var_single_tokens.iter().map(&to_char).collect();
TokenizerSettings {
let tokenizer_settings = TokenizerSettings {
white_space: white_space_native,
single_tokens: single_tokens_native,
keywords,
@ -162,15 +183,35 @@ impl TokenizerSettings {
string_escapes_allowed_in_raw_strings,
nested_comments,
hint_start,
};
#[cfg(feature = "profiling")]
{
tokenizer_settings.write_json_to_string();
}
tokenizer_settings
}
}
#[cfg(feature = "profiling")]
impl TokenizerSettings {
    /// Serializes these settings to JSON and stores the result in
    /// `benches/tokenizer_settings.json` under the crate's manifest directory.
    ///
    /// Despite its name, this method returns nothing: the JSON text is written
    /// to that file, replacing any previous contents.
    ///
    /// # Panics
    /// Panics if serialization fails or the file cannot be written.
    pub fn write_json_to_string(&self) {
        let serialized = serde_json::to_string(self).unwrap();
        // `CARGO_MANIFEST_DIR` is resolved at compile time by `env!`.
        let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
        std::fs::write(
            manifest_dir.join("benches/tokenizer_settings.json"),
            &serialized,
        )
        .unwrap();
    }
}
#[derive(Clone, Debug)]
#[pyclass]
#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))]
pub struct TokenizerDialectSettings {
pub unescaped_sequences: HashMap<String, String>,
pub identifiers_can_start_with_digit: bool,
pub numbers_can_be_underscore_separated: bool,
}
#[pymethods]
@ -179,10 +220,29 @@ impl TokenizerDialectSettings {
pub fn new(
unescaped_sequences: HashMap<String, String>,
identifiers_can_start_with_digit: bool,
numbers_can_be_underscore_separated: bool,
) -> Self {
TokenizerDialectSettings {
let settings = TokenizerDialectSettings {
unescaped_sequences,
identifiers_can_start_with_digit,
numbers_can_be_underscore_separated,
};
#[cfg(feature = "profiling")]
{
settings.write_json_to_string();
}
settings
}
}
#[cfg(feature = "profiling")]
impl TokenizerDialectSettings {
    /// Serializes these settings to JSON and stores the result in
    /// `benches/tokenizer_dialect_settings.json` under the crate's manifest
    /// directory.
    ///
    /// Despite its name, this method returns nothing: the JSON text is written
    /// to that file, replacing any previous contents.
    ///
    /// # Panics
    /// Panics if serialization fails or the file cannot be written.
    pub fn write_json_to_string(&self) {
        let serialized = serde_json::to_string(self).unwrap();
        // `CARGO_MANIFEST_DIR` is resolved at compile time by `env!`.
        let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
        std::fs::write(
            manifest_dir.join("benches/tokenizer_dialect_settings.json"),
            &serialized,
        )
        .unwrap();
    }
}

61
sqlglotrs/src/token.rs Normal file
View file

@ -0,0 +1,61 @@
use crate::settings::TokenType;
use pyo3::prelude::PyListMethods;
use pyo3::types::{PyList, PyNone, PyString};
use pyo3::{pyclass, IntoPy, Py, PyObject, Python};
// A single token, exposed to Python through `#[pyclass]`.
//
// Text and comments are stored as Python objects (`Py<PyString>`, `Py<PyList>`),
// so building or mutating a `Token` requires the GIL (see `Token::new` and
// `Token::append_comments`).
#[derive(Debug)]
#[pyclass]
pub struct Token {
// Numeric token type (`TokenType`); read-only from Python under the
// attribute name `token_type_index`.
#[pyo3(get, name = "token_type_index")]
pub token_type: TokenType,
// Python-side token type object; readable and writable from Python as
// `token_type`. `Token::new` initialises it to Python `None`.
#[pyo3(get, set, name = "token_type")]
pub token_type_py: PyObject,
// Token text as a Python string; read-only from Python.
#[pyo3(get)]
pub text: Py<PyString>,
// NOTE(review): `line`, `col`, `start` and `end` are presumably the token's
// position in the tokenized input — confirm units and indexing base against
// the tokenizer. All four are read-only from Python.
#[pyo3(get)]
pub line: usize,
#[pyo3(get)]
pub col: usize,
#[pyo3(get)]
pub start: usize,
#[pyo3(get)]
pub end: usize,
// Comments attached to the token, held in a Python list; read-only attribute
// from Python, extended from Rust via `Token::append_comments`.
#[pyo3(get)]
pub comments: Py<PyList>,
}
impl Token {
pub fn new(
token_type: TokenType,
text: String,
line: usize,
col: usize,
start: usize,
end: usize,
comments: Vec<String>,
) -> Token {
Python::with_gil(|py| Token {
token_type,
token_type_py: PyNone::get_bound(py).into_py(py),
text: PyString::new_bound(py, &text).into_py(py),
line,
col,
start,
end,
comments: PyList::new_bound(py, &comments).into(),
})
}
pub fn append_comments(&self, comments: &mut Vec<String>) {
Python::with_gil(|py| {
let pylist = self.comments.bind(py);
for comment in comments.iter() {
if let Err(_) = pylist.append(comment) {
panic!("Failed to append comments to the Python list");
}
}
});
// Simulate `Vec::append`.
let _ = std::mem::replace(comments, Vec::new());
}
}

View file

@ -1,5 +1,6 @@
use crate::settings::TokenType;
use crate::trie::{Trie, TrieResult};
use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings};
use pyo3::exceptions::PyException;
use pyo3::prelude::*;
use std::cmp::{max, min};
@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> {
self.advance(1)?;
// Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start {
if self.settings.nested_comments
&& !self.is_end
&& self.chars(comment_start_size) == *comment_start
{
self.advance(comment_start_size as isize)?;
comment_count += 1
}
@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> {
if comment_start == self.settings.hint_start
&& self.tokens.last().is_some()
&& self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) {
&& self
.settings
.tokens_preceding_hint
.contains(&self.tokens.last().unwrap().token_type)
{
self.add(self.token_types.hint, None)?;
}
@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> {
self.advance(-(tag.len() as isize))?;
self.add(self.token_types.heredoc_string_alternative, None)?;
return Ok(true)
return Ok(true);
}
(None, *token_type, format!("{}{}{}", start, tag, end))
@ -455,10 +463,11 @@ impl<'a> TokenizerState<'a> {
};
self.advance(start.len() as isize)?;
let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
let text =
self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?;
if let Some(b) = base {
if u64::from_str_radix(&text, b).is_err() {
if u128::from_str_radix(&text, b).is_err() {
return self.error_result(format!(
"Numeric string contains invalid characters from {}:{}",
self.line, self.start
@ -531,10 +540,16 @@ impl<'a> TokenizerState<'a> {
)
.map(|x| *x);
let replaced = literal.replace("_", "");
if let Some(unwrapped_token_type) = token_type {
self.add(self.token_types.number, Some(number_text))?;
self.add(self.token_types.dcolon, Some("::".to_string()))?;
self.add(unwrapped_token_type, Some(literal))?;
} else if self.dialect_settings.numbers_can_be_underscore_separated
&& self.is_numeric(&replaced)
{
self.add(self.token_types.number, Some(number_text + &replaced))?;
} else if self.dialect_settings.identifiers_can_start_with_digit {
self.add(self.token_types.var, None)?;
} else {
@ -673,7 +688,7 @@ impl<'a> TokenizerState<'a> {
if self.is_end {
if !raise_unmatched {
text.push(self.current_char);
return Ok(text)
return Ok(text);
}
return self.error_result(format!(
@ -699,11 +714,17 @@ impl<'a> TokenizerState<'a> {
}
fn is_identifier(&mut self, s: &str) -> bool {
s.chars().enumerate().all(
|(i, c)|
if i == 0 { self.is_alphabetic_or_underscore(c) }
else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) }
)
s.chars().enumerate().all(|(i, c)| {
if i == 0 {
self.is_alphabetic_or_underscore(c)
} else {
self.is_alphabetic_or_underscore(c) || c.is_digit(10)
}
})
}
/// Returns `true` when `s` consists solely of ASCII decimal digits (`0`-`9`).
///
/// An empty `s` also yields `true`, because there is no character to reject.
fn is_numeric(&mut self, s: &str) -> bool {
    // Checking bytes is equivalent to checking chars here: every byte of a
    // multi-byte UTF-8 character is >= 0x80 and therefore never an ASCII digit.
    s.bytes().all(|byte| byte.is_ascii_digit())
}
fn extract_value(&mut self) -> Result<String, TokenizerError> {