diff --git a/datafusion/sqllogictest/test_files/run_end_encoding.slt b/datafusion/sqllogictest/test_files/run_end_encoding.slt new file mode 100644 index 0000000000000..8883aa674cdeb --- /dev/null +++ b/datafusion/sqllogictest/test_files/run_end_encoding.slt @@ -0,0 +1,798 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for querying on Run-End Encoded (REE) data + +# This table models a common pattern in event and timeseries logs, +# where a string column (e.g., a tag or type) contains long +# contiguous runs of the same value — a pattern ideal for +# Run-End Encoding. + +# There are three types of columns: +# 1. `pathway_type`: a REE-encoded Utf8 column, repeating values in long runs +# 2. `latency`: a Float64 field column with numeric values +# 3. 
`time`: a nanosecond timestamp field + +# The REE encoding stores: +# run_ends = [4, 8, 16] +# values = [ +# "pathway_type:full_detailed_sequence", +# "pathway_type:partial_detailed_sequence", +# "pathway_type:full_detailed_sequence" +# ] + +# statement ok +# CREATE VIEW ree_test_long_strings AS +# SELECT +# arrow_cast(column1, 'RunEndEncoded(Int32, Utf8)') AS pathway_type, +# arrow_cast(column2, 'Float64') AS latency, +# arrow_cast(column3, 'Timestamp(Nanosecond, None)') AS time +# FROM ( +# VALUES +# ('pathway_type:full_detailed_sequence', 10.0, 1703030400000000000), +# ('pathway_type:full_detailed_sequence', 12.0, 1703031000000000000), +# ('pathway_type:full_detailed_sequence', 11.5, 1703031600000000000), +# ('pathway_type:full_detailed_sequence', 11.0, 1703032200000000000), +# ('pathway_type:partial_detailed_sequence',15.0, 1703032800000000000), +# ('pathway_type:partial_detailed_sequence',15.2, 1703033400000000000), +# ('pathway_type:partial_detailed_sequence',15.1, 1703034000000000000), +# ('pathway_type:partial_detailed_sequence',15.3, 1703034600000000000), +# ('pathway_type:full_detailed_sequence', 12.0, 1703035200000000000), +# ('pathway_type:full_detailed_sequence', 12.1, 1703035800000000000), +# ('pathway_type:full_detailed_sequence', 12.3, 1703036400000000000), +# ('pathway_type:full_detailed_sequence', 12.5, 1703037000000000000), +# ('pathway_type:full_detailed_sequence', 12.4, 1703037600000000000), +# ('pathway_type:full_detailed_sequence', 12.6, 1703038200000000000), +# ('pathway_type:full_detailed_sequence', 12.7, 1703038800000000000), +# ('pathway_type:full_detailed_sequence', 12.8, 1703039400000000000) +# ); +# statement ok +# CREATE VIEW ree_test_long_strings AS +# SELECT +# arrow_cast(column1, 'RunEndEncoded(Int32, Utf8)') AS pathway_type, +# arrow_cast(column2, 'Float64') AS latency, +# arrow_cast(column3, 'Timestamp(Nanosecond, None)') AS time +# FROM ( +# VALUES +# ('pathway_type:full_detailed_sequence', 10.0, 1703030400000000000), +# 
('pathway_type:full_detailed_sequence', 12.0, 1703031000000000000), +# ('pathway_type:full_detailed_sequence', 11.5, 1703031600000000000), +# ('pathway_type:full_detailed_sequence', 11.0, 1703032200000000000), +# ('pathway_type:partial_detailed_sequence',15.0, 1703032800000000000), +# ('pathway_type:partial_detailed_sequence',15.2, 1703033400000000000), +# ('pathway_type:partial_detailed_sequence',15.1, 1703034000000000000), +# ('pathway_type:partial_detailed_sequence',15.3, 1703034600000000000), +# ('pathway_type:full_detailed_sequence', 12.0, 1703035200000000000), +# ('pathway_type:full_detailed_sequence', 12.1, 1703035800000000000), +# ('pathway_type:full_detailed_sequence', 12.3, 1703036400000000000), +# ('pathway_type:full_detailed_sequence', 12.5, 1703037000000000000), +# ('pathway_type:full_detailed_sequence', 12.4, 1703037600000000000), +# ('pathway_type:full_detailed_sequence', 12.6, 1703038200000000000), +# ('pathway_type:full_detailed_sequence', 12.7, 1703038800000000000), +# ('pathway_type:full_detailed_sequence', 12.8, 1703039400000000000) +# ); + +# expanded view of REE +# query TRI +# select * from ree_test_long_strings; +# ---- +# pathway_type:full_detailed_sequence 10.0 1703030400000000000 +# pathway_type:full_detailed_sequence 12.0 1703031000000000000 +# pathway_type:full_detailed_sequence 11.5 1703031600000000000 +# pathway_type:full_detailed_sequence 11.0 1703032200000000000 +# pathway_type:partial_detailed_sequence 15.0 1703032800000000000 +# pathway_type:partial_detailed_sequence 15.2 1703033400000000000 +# pathway_type:partial_detailed_sequence 15.1 1703034000000000000 +# pathway_type:partial_detailed_sequence 15.3 1703034600000000000 +# pathway_type:full_detailed_sequence 12.0 1703035200000000000 +# pathway_type:full_detailed_sequence 12.1 1703035800000000000 +# pathway_type:full_detailed_sequence 12.3 1703036400000000000 +# pathway_type:full_detailed_sequence 12.5 1703037000000000000 +# pathway_type:full_detailed_sequence 12.4
1703037600000000000 +# pathway_type:full_detailed_sequence 12.6 1703038200000000000 +# pathway_type:full_detailed_sequence 12.7 1703038800000000000 +# pathway_type:full_detailed_sequence 12.8 1703039400000000000 +# query TRI +# select * from ree_test_long_strings; +# ---- +# pathway_type:full_detailed_sequence 10.0 1703030400000000000 +# pathway_type:full_detailed_sequence 12.0 1703031000000000000 +# pathway_type:full_detailed_sequence 11.5 1703031600000000000 +# pathway_type:full_detailed_sequence 11.0 1703032200000000000 +# pathway_type:partial_detailed_sequence 15.0 1703032800000000000 +# pathway_type:partial_detailed_sequence 15.2 1703033400000000000 +# pathway_type:partial_detailed_sequence 15.1 1703034000000000000 +# pathway_type:partial_detailed_sequence 15.3 1703034600000000000 +# pathway_type:full_detailed_sequence 12.0 1703035200000000000 +# pathway_type:full_detailed_sequence 12.1 1703035800000000000 +# pathway_type:full_detailed_sequence 12.3 1703036400000000000 +# pathway_type:full_detailed_sequence 12.5 1703037000000000000 +# pathway_type:full_detailed_sequence 12.4 1703037600000000000 +# pathway_type:full_detailed_sequence 12.6 1703038200000000000 +# pathway_type:full_detailed_sequence 12.7 1703038800000000000 +# pathway_type:full_detailed_sequence 12.8 1703039400000000000 + +# The data types of the table's columns must be correct +# query TTT +# DESCRIBE ree_test_long_strings; +# ---- +# pathway_type RunEndEncoded(Int32, Utf8) YES +# latency Float64 YES +# time Timestamp(Nanosecond,None) YES +# query TTT +# DESCRIBE ree_test_long_strings; +# ---- +# pathway_type RunEndEncoded(Int32, Utf8) YES +# latency Float64 YES +# time Timestamp(Nanosecond,None) YES + +# Basic filtering tests +# query TRI +# SELECT pathway_type, latency, time FROM ree_test_long_strings WHERE latency >= 15.0; +# ---- +# pathway_type:partial_detailed_sequence 15.0 1703032800000000000 +# pathway_type:partial_detailed_sequence 15.2 1703033400000000000 +# pathway_type:partial_detailed_sequence 15.1
1703034000000000000 +# pathway_type:partial_detailed_sequence 15.3 1703034600000000000 +# query TRI +# SELECT pathway_type, latency, time FROM ree_test_long_strings WHERE latency >= 15.0; +# ---- +# pathway_type:partial_detailed_sequence 15.0 1703032800000000000 +# pathway_type:partial_detailed_sequence 15.2 1703033400000000000 +# pathway_type:partial_detailed_sequence 15.1 1703034000000000000 +# pathway_type:partial_detailed_sequence 15.3 1703034600000000000 + +# query TRI +# SELECT pathway_type, latency, time FROM ree_test_long_strings WHERE pathway_type = 'pathway_type:full_detailed_sequence' AND latency < 12.0; +# ---- +# pathway_type:full_detailed_sequence 10.0 1703030400000000000 +# pathway_type:full_detailed_sequence 11.5 1703031600000000000 +# pathway_type:full_detailed_sequence 11.0 1703032200000000000 +# query TRI +# SELECT pathway_type, latency, time FROM ree_test_long_strings WHERE pathway_type = 'pathway_type:full_detailed_sequence' AND latency < 12.0; +# ---- +# pathway_type:full_detailed_sequence 10.0 1703030400000000000 +# pathway_type:full_detailed_sequence 11.5 1703031600000000000 +# pathway_type:full_detailed_sequence 11.0 1703032200000000000 + +# COUNT tests +# query I +# SELECT COUNT(*) FROM ree_test_long_strings; +# ---- +# 16 +# query I +# SELECT COUNT(*) FROM ree_test_long_strings; +# ---- +# 16 + +# query I +# SELECT COUNT(DISTINCT pathway_type) FROM ree_test_long_strings; +# ---- +# 2 +# query I +# SELECT COUNT(DISTINCT pathway_type) FROM ree_test_long_strings; +# ---- +# 2 + +# DISTINCT tests +# query T +# SELECT DISTINCT pathway_type FROM ree_test_long_strings ORDER BY pathway_type; +# ---- +# pathway_type:full_detailed_sequence +# pathway_type:partial_detailed_sequence + +# Validate REE column type preservation +# query TTT +# DESCRIBE TABLE (SELECT DISTINCT pathway_type FROM ree_test_long_strings ORDER BY pathway_type); +# ---- +# pathway_type RunEndEncoded(Int32, Utf8) YES +# query T +# SELECT DISTINCT pathway_type FROM
ree_test_long_strings ORDER BY pathway_type; +# ---- +# pathway_type:full_detailed_sequence +# pathway_type:partial_detailed_sequence + +# Validate REE column type preservation +# query TTT +# DESCRIBE TABLE (SELECT DISTINCT pathway_type FROM ree_test_long_strings ORDER BY pathway_type); +# ---- +# pathway_type RunEndEncoded(Int32, Utf8) YES + +# clean up +# statement ok +# DROP VIEW ree_test_long_strings; +# statement ok +# DROP VIEW ree_test_long_strings; + +# Create table with two REE columns +# statement ok +# CREATE VIEW ree_test_two_columns AS +# SELECT +# arrow_cast(column1, 'RunEndEncoded(Int32, Utf8)') AS name, +# arrow_cast(column2, 'RunEndEncoded(Int32, Utf8)') AS category, +# arrow_cast(column3, 'Int32') AS value +# FROM ( +# VALUES +# ('Alice', 'ADMIN', 100), +# ('Alice', 'ADMIN', 101), +# ('Alice', 'ADMIN', 102), +# ('Bob', 'USER', 200), +# ('Bob', 'USER', 201), +# ('Bob', 'USER', 202), +# ('Charlie', 'ADMIN', 300), +# ('Charlie', 'ADMIN', 301), +# ('David', 'USER', 400), +# ('David', 'USER', 401), +# ('Eve', 'ADMIN', 500), +# ('Eve', 'ADMIN', 501) +# ); +# statement ok +# CREATE VIEW ree_test_two_columns AS +# SELECT +# arrow_cast(column1, 'RunEndEncoded(Int32, Utf8)') AS name, +# arrow_cast(column2, 'RunEndEncoded(Int32, Utf8)') AS category, +# arrow_cast(column3, 'Int32') AS value +# FROM ( +# VALUES +# ('Alice', 'ADMIN', 100), +# ('Alice', 'ADMIN', 101), +# ('Alice', 'ADMIN', 102), +# ('Bob', 'USER', 200), +# ('Bob', 'USER', 201), +# ('Bob', 'USER', 202), +# ('Charlie', 'ADMIN', 300), +# ('Charlie', 'ADMIN', 301), +# ('David', 'USER', 400), +# ('David', 'USER', 401), +# ('Eve', 'ADMIN', 500), +# ('Eve', 'ADMIN', 501) +# ); + +# Verify table structure +# query TTT +# DESCRIBE ree_test_two_columns; +# ---- +# name RunEndEncoded(Int32, Utf8) YES +# category RunEndEncoded(Int32, Utf8) YES +# value Int32 YES +# query TTT +# DESCRIBE ree_test_two_columns; +# ---- +# name RunEndEncoded(Int32, Utf8) YES +# category RunEndEncoded(Int32, Utf8) YES +# value 
Int32 YES + +# Show all data +# query TTI +# SELECT * FROM ree_test_two_columns; +# ---- +# Alice ADMIN 100 +# Alice ADMIN 101 +# Alice ADMIN 102 +# Bob USER 200 +# Bob USER 201 +# Bob USER 202 +# Charlie ADMIN 300 +# Charlie ADMIN 301 +# David USER 400 +# David USER 401 +# Eve ADMIN 500 +# Eve ADMIN 501 +# query TTI +# SELECT * FROM ree_test_two_columns; +# ---- +# Alice ADMIN 100 +# Alice ADMIN 101 +# Alice ADMIN 102 +# Bob USER 200 +# Bob USER 201 +# Bob USER 202 +# Charlie ADMIN 300 +# Charlie ADMIN 301 +# David USER 400 +# David USER 401 +# Eve ADMIN 500 +# Eve ADMIN 501 + +# LOWER function tests +# query T +# SELECT LOWER(name) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# alice +# query T +# SELECT LOWER(name) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# alice + +# query T +# SELECT LOWER(category) FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# admin +# query T +# SELECT LOWER(category) FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# admin + +# query TT +# SELECT LOWER(name), LOWER(category) FROM ree_test_two_columns WHERE name = 'Bob' LIMIT 1; +# ---- +# bob user +# query TT +# SELECT LOWER(name), LOWER(category) FROM ree_test_two_columns WHERE name = 'Bob' LIMIT 1; +# ---- +# bob user + +# query TTI +# SELECT LOWER(name), LOWER(category), value FROM ree_test_two_columns ORDER BY name, value LIMIT 3; +# ---- +# alice admin 100 +# alice admin 101 +# alice admin 102 + +# Validate LOWER() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT LOWER(name) AS lower_name FROM ree_test_two_columns LIMIT 1); +# ---- +# lower_name RunEndEncoded(Int32, Utf8) YES +# query TTI +# SELECT LOWER(name), LOWER(category), value FROM ree_test_two_columns ORDER BY name, value LIMIT 3; +# ---- +# alice admin 100 +# alice admin 101 +# alice admin 102 + +# Validate LOWER() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT LOWER(name) AS lower_name 
FROM ree_test_two_columns LIMIT 1); +# ---- +# lower_name RunEndEncoded(Int32, Utf8) YES + +# UPPER() function tests +# query T +# SELECT UPPER(name) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# ALICE +# query T +# SELECT UPPER(name) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# ALICE + +# query TT +# SELECT UPPER(name), UPPER(category) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# CHARLIE ADMIN +# query TT +# SELECT UPPER(name), UPPER(category) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# CHARLIE ADMIN + +# UPPER on multiple REE columns +# query TTI +# SELECT UPPER(name), UPPER(category), value FROM ree_test_two_columns WHERE name = 'David' ORDER BY value; +# ---- +# DAVID USER 400 +# DAVID USER 401 + +# Validate UPPER() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT UPPER(category) AS upper_category FROM ree_test_two_columns LIMIT 1); +# ---- +# upper_category RunEndEncoded(Int32, Utf8) YES +# query TTI +# SELECT UPPER(name), UPPER(category), value FROM ree_test_two_columns WHERE name = 'David' ORDER BY value; +# ---- +# DAVID USER 400 +# DAVID USER 401 + +# Validate UPPER() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT UPPER(category) AS upper_category FROM ree_test_two_columns LIMIT 1); +# ---- +# upper_category RunEndEncoded(Int32, Utf8) YES + +# CONCAT() function tests +# query T +# SELECT CONCAT(name, '_', category) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# Alice_ADMIN + +# Validate CONCAT() function return type with REE columns +# query TT +# DESCRIBE TABLE (SELECT CONCAT(name, '_', category) AS combined FROM ree_test_two_columns LIMIT 1); +# ---- +# combined RunEndEncoded(Int32, Utf8) YES +# query T +# SELECT CONCAT(name, '_', category) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# Alice_ADMIN + +# Validate CONCAT() function return type with REE columns +# query TT +# 
DESCRIBE TABLE (SELECT CONCAT(name, '_', category) AS combined FROM ree_test_two_columns LIMIT 1); +# ---- +# combined RunEndEncoded(Int32, Utf8) YES + +# stacking CONCAT and LOWER functions +# query T +# SELECT CONCAT(LOWER(name), '-', LOWER(category)) FROM ree_test_two_columns WHERE name = 'Bob' LIMIT 1; +# ---- +# bob-user +# query T +# SELECT CONCAT(LOWER(name), '-', LOWER(category)) FROM ree_test_two_columns WHERE name = 'Bob' LIMIT 1; +# ---- +# bob-user + +# query T +# SELECT CONCAT(name, ' is a ', category) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# Charlie is a ADMIN +# query T +# SELECT CONCAT(name, ' is a ', category) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# Charlie is a ADMIN + +# SUBSTR()/SUBSTRING() function tests +# query T +# SELECT SUBSTR(name, 1, 3) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# Ali +# query T +# SELECT SUBSTR(name, 1, 3) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# Ali + +# query T +# SELECT SUBSTR(category, 1, 2) FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# AD +# query T +# SELECT SUBSTR(category, 1, 2) FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# AD + +# query TT +# SELECT SUBSTR(name, 1, 2), SUBSTR(category, 1, 1) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# Ch A +# query TT +# SELECT SUBSTR(name, 1, 2), SUBSTR(category, 1, 1) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# Ch A + +# query TTI +# SELECT SUBSTR(name, 1, 3), SUBSTR(category, 1, 3), value FROM ree_test_two_columns WHERE name = 'David' ORDER BY value; +# ---- +# Dav USE 400 +# Dav USE 401 + +# Validate SUBSTR() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT SUBSTR(name, 1, 3) AS name_prefix FROM ree_test_two_columns LIMIT 1); +# ---- +# name_prefix RunEndEncoded(Int32, Utf8) YES +# query TTI +# SELECT SUBSTR(name, 1, 3), SUBSTR(category, 1, 3), 
value FROM ree_test_two_columns WHERE name = 'David' ORDER BY value; +# ---- +# Dav USE 400 +# Dav USE 401 + +# Validate SUBSTR() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT SUBSTR(name, 1, 3) AS name_prefix FROM ree_test_two_columns LIMIT 1); +# ---- +# name_prefix RunEndEncoded(Int32, Utf8) YES + +# REPLACE() function tests +# query T +# SELECT REPLACE(name, 'i', 'y') FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# Alyce +# query T +# SELECT REPLACE(name, 'i', 'y') FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# Alyce + +# query T +# SELECT REPLACE(category, 'ADMIN', 'MANAGER') FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# MANAGER +# query T +# SELECT REPLACE(category, 'ADMIN', 'MANAGER') FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# MANAGER + +# query T +# SELECT REPLACE(name, 'e', 'a') FROM ree_test_two_columns WHERE name = 'Eve' LIMIT 1; +# ---- +# Eva + +# Validate REPLACE() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT REPLACE(name, 'i', 'y') AS replaced_name FROM ree_test_two_columns LIMIT 1); +# ---- +# replaced_name RunEndEncoded(Int32, Utf8) YES +# query T +# SELECT REPLACE(name, 'e', 'a') FROM ree_test_two_columns WHERE name = 'Eve' LIMIT 1; +# ---- +# Eva + +# Validate REPLACE() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT REPLACE(name, 'i', 'y') AS replaced_name FROM ree_test_two_columns LIMIT 1); +# ---- +# replaced_name RunEndEncoded(Int32, Utf8) YES + +# REVERSE() function tests +# query T +# SELECT REVERSE(category) FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# NIMDA +# query T +# SELECT REVERSE(category) FROM ree_test_two_columns WHERE category = 'ADMIN' LIMIT 1; +# ---- +# NIMDA + +# query TT +# SELECT REVERSE(name), REVERSE(category) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# eilrahC NIMDA + +# Validate REVERSE() function 
return type on REE column +# query TT +# DESCRIBE TABLE (SELECT REVERSE(category) AS reversed_category FROM ree_test_two_columns LIMIT 1); +# ---- +# reversed_category RunEndEncoded(Int32, Utf8) YES + +# query TTI +# SELECT REVERSE(name), REVERSE(category), value FROM ree_test_two_columns WHERE name = 'Eve' ORDER BY value; +# ---- +# evE NIMDA 500 +# evE NIMDA 501 +# query TT +# SELECT REVERSE(name), REVERSE(category) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# eilrahC NIMDA + +# Validate REVERSE() function return type on REE column +# query TT +# DESCRIBE TABLE (SELECT REVERSE(category) AS reversed_category FROM ree_test_two_columns LIMIT 1); +# ---- +# reversed_category RunEndEncoded(Int32, Utf8) YES + +# query TTI +# SELECT REVERSE(name), REVERSE(category), value FROM ree_test_two_columns WHERE name = 'Eve' ORDER BY value; +# ---- +# evE NIMDA 500 +# evE NIMDA 501 + +# Combined string function tests +# query T +# SELECT UPPER(SUBSTR(name, 1, 2)) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# AL +# query T +# SELECT UPPER(SUBSTR(name, 1, 2)) FROM ree_test_two_columns WHERE name = 'Alice' LIMIT 1; +# ---- +# AL + +# query T +# SELECT LOWER(REVERSE(category)) FROM ree_test_two_columns WHERE category = 'USER' LIMIT 1; +# ---- +# resu +# query T +# SELECT LOWER(REVERSE(category)) FROM ree_test_two_columns WHERE category = 'USER' LIMIT 1; +# ---- +# resu + +# query T +# SELECT CONCAT(SUBSTR(name, 1, 1), '_', LOWER(category)) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# C_admin +# query T +# SELECT CONCAT(SUBSTR(name, 1, 1), '_', LOWER(category)) FROM ree_test_two_columns WHERE name = 'Charlie' LIMIT 1; +# ---- +# C_admin + +# query TT +# SELECT +# CONCAT(SUBSTR(name, 1, 2), '_', LOWER(category)), +# REVERSE(REPLACE(name, 'e', 'a')) +# FROM ree_test_two_columns WHERE name = 'Eve' LIMIT 1; +# ---- +# Ev_admin avE +# query TT +# SELECT +# CONCAT(SUBSTR(name, 1, 2), '_', LOWER(category)), +# 
REVERSE(REPLACE(name, 'e', 'a')) +# FROM ree_test_two_columns WHERE name = 'Eve' LIMIT 1; +# ---- +# Ev_admin avE + +# String functions with filtering +# query T +# SELECT LOWER(name) FROM ree_test_two_columns WHERE UPPER(category) = 'ADMIN' LIMIT 3; +# ---- +# alice +# alice +# alice +# query T +# SELECT LOWER(name) FROM ree_test_two_columns WHERE UPPER(category) = 'ADMIN' LIMIT 3; +# ---- +# alice +# alice +# alice + +# query T +# SELECT SUBSTR(name, 1, 3) FROM ree_test_two_columns WHERE LOWER(category) = 'user' LIMIT 3; +# ---- +# Bob +# Bob +# Bob +# query T +# SELECT SUBSTR(name, 1, 3) FROM ree_test_two_columns WHERE LOWER(category) = 'user' LIMIT 3; +# ---- +# Bob +# Bob +# Bob + +# query T +# SELECT REVERSE(name) FROM ree_test_two_columns WHERE SUBSTR(category, 1, 1) = 'A' LIMIT 3; +# ---- +# ecilA +# ecilA +# ecilA + +# Cleanup +# statement ok +# DROP VIEW ree_test_two_columns; + +# Test REE with NULL values & non repeated values +# statement ok +# CREATE VIEW ree_test_edge_cases AS +# SELECT +# arrow_cast(column1, 'RunEndEncoded(Int32, Utf8)') AS status, +# arrow_cast(column2, 'RunEndEncoded(Int32, Utf8)') AS unique_id, +# arrow_cast(column3, 'Int32') AS value +# FROM ( +# VALUES +# ('active', 'user_001', 100), +# ('active', 'user_002', 101), +# (NULL, 'user_003', 102), +# (NULL, 'user_004', 103), +# ('inactive', 'user_005', 104), +# ('active', 'user_006', 105), +# (NULL, 'user_007', 106), +# ('inactive', 'user_008', 107) +# ); + +# Verify NULL handling and unique values in REE +# query TTI +# SELECT * FROM ree_test_edge_cases; +# ---- +# active user_001 100 +# active user_002 101 +# NULL user_003 102 +# NULL user_004 103 +# inactive user_005 104 +# active user_006 105 +# NULL user_007 106 +# inactive user_008 107 + +# query TTT +# DESCRIBE ree_test_edge_cases; +# ---- +# status RunEndEncoded(Int32, Utf8) YES +# unique_id RunEndEncoded(Int32, Utf8) YES +# value Int32 YES + +# Test filtering with NULLs on status column +# query TTI +# SELECT * FROM 
ree_test_edge_cases WHERE status IS NULL; +# ---- +# NULL user_003 102 +# NULL user_004 103 +# NULL user_007 106 + +# Validate that the datatype of the column is still RunEndEncoded even if the values are null +# query TTT +# DESCRIBE TABLE (SELECT status FROM ree_test_edge_cases WHERE status IS NULL); +# ---- +# status RunEndEncoded(Int32, Utf8) YES + +# query TTI +# SELECT * FROM ree_test_edge_cases WHERE status IS NOT NULL; +# ---- +# active user_001 100 +# active user_002 101 +# inactive user_005 104 +# active user_006 105 +# inactive user_008 107 + +# Test IS DISTINCT FROM with REE columns containing NULLs: a NULL status is distinct from 'active', so NULL rows are kept. ORDER BY value pins the row order of the expected output. NOTE(review): this is the only active test here and it reads the ree_test_edge_cases view, whose CREATE VIEW is commented out earlier in this file - confirm the view exists before enabling. +query TTI +SELECT status, unique_id, value FROM ree_test_edge_cases +WHERE status IS DISTINCT FROM 'active' AND unique_id IS DISTINCT FROM 'user_001' ORDER BY value; +---- +NULL user_003 102 +NULL user_004 103 +inactive user_005 104 +NULL user_007 106 +inactive user_008 107 + +# Test aggregation with NULLs on status column +# query I +# SELECT COUNT(status) FROM ree_test_edge_cases; +# ---- +# 5 + +# query I +# SELECT COUNT(DISTINCT status) FROM ree_test_edge_cases; +# ---- +# 2 + +# Test that REE still works correctly with no runs (unique_id column) +# query I +# SELECT COUNT(DISTINCT unique_id) FROM ree_test_edge_cases; +# ---- +# 8 + +# Test string functions on unique values (unique_id column) +# query T +# SELECT SUBSTR(unique_id, 1, 4) FROM ree_test_edge_cases WHERE status = 'active' LIMIT 2; +# ---- +# user +# user + +# Test string functions with NULLs (status column) +# query T +# SELECT UPPER(status) FROM ree_test_edge_cases WHERE status IS NOT NULL LIMIT 3; +# ---- +# ACTIVE +# ACTIVE +# INACTIVE + +# Test combined operations on both REE columns +# query TT +# SELECT LOWER(status), SUBSTR(unique_id, 1, 4) FROM ree_test_edge_cases WHERE status IS NOT NULL LIMIT 3; +# ---- +# active user +# active user +# inactive user + +# Test filtering using string functions on unique values +# query TTI +# SELECT * FROM ree_test_edge_cases WHERE SUBSTR(unique_id, 1, 4) = 'user'
AND status IS NOT NULL; +# ---- +# active user_001 100 +# active user_002 101 +# inactive user_005 104 +# active user_006 105 +# inactive user_008 107 + +# Test aggregation on both REE columns +# query II +# SELECT COUNT(DISTINCT status), COUNT(DISTINCT unique_id) FROM ree_test_edge_cases; +# ---- +# 2 8 + +# Cleanup +# statement ok +# DROP VIEW ree_test_edge_cases; +# statement ok +# DROP VIEW ree_test_edge_cases; + +# TODO: Review the logical plans generated for these queries with EXPLAIN; for reference see https://github.com/apache/datafusion/blob/main/datafusion/sqllogictest/test_files/dictionary.slt line 438 \ No newline at end of file