diff --git a/rust/datafusion/src/datasource/csv.rs b/rust/datafusion/src/datasource/csv.rs index c2b9b572d928..8fc27bd1cc91 100644 --- a/rust/datafusion/src/datasource/csv.rs +++ b/rust/datafusion/src/datasource/csv.rs @@ -53,7 +53,7 @@ pub struct CsvFile { has_header: bool, delimiter: u8, file_extension: String, - statistics: Option, + statistics: Statistics, } impl CsvFile { @@ -77,7 +77,7 @@ impl CsvFile { has_header: options.has_header, delimiter: options.delimiter, file_extension: String::from(options.file_extension), - statistics: None, + statistics: Statistics::default(), }) } } @@ -108,7 +108,7 @@ impl TableProvider for CsvFile { )?)) } - fn statistics(&self) -> Option { + fn statistics(&self) -> Statistics { self.statistics.clone() } } diff --git a/rust/datafusion/src/datasource/datasource.rs b/rust/datafusion/src/datasource/datasource.rs index 5882336fe176..af77346c385b 100644 --- a/rust/datafusion/src/datasource/datasource.rs +++ b/rust/datafusion/src/datasource/datasource.rs @@ -26,12 +26,12 @@ use crate::physical_plan::ExecutionPlan; /// This table statistics are estimates. /// It can not be used directly in the precise compute -#[derive(Clone)] +#[derive(Clone, Default)] pub struct Statistics { /// The number of table rows - pub num_rows: usize, + pub num_rows: Option, /// total byte of the table rows - pub total_byte_size: usize, + pub total_byte_size: Option, } /// Source table @@ -52,5 +52,5 @@ pub trait TableProvider { /// Returns the table Statistics /// Statistics should be optional because not all data sources can provide statistics. - fn statistics(&self) -> Option; + fn statistics(&self) -> Statistics; } diff --git a/rust/datafusion/src/datasource/memory.rs b/rust/datafusion/src/datasource/memory.rs index 5b4f9fd7301a..f00a3813197b 100644 --- a/rust/datafusion/src/datasource/memory.rs +++ b/rust/datafusion/src/datasource/memory.rs @@ -36,21 +36,28 @@ use crate::physical_plan::ExecutionPlan; pub struct MemTable { schema: SchemaRef, batches: Vec>, - statistics: Option, + statistics: Statistics, } impl MemTable { /// Create a new in-memory table from the provided schema and record batches pub fn try_new(schema: SchemaRef, partitions: Vec>) -> Result { - if partitions.iter().all(|partition| { - partition + if partitions + .iter() + .flatten() + .all(|batches| batches.schema() == schema) + { + let num_rows: usize = partitions .iter() - .all(|batches| batches.schema().as_ref() == schema.as_ref()) - }) { + .flat_map(|batches| batches.iter().map(RecordBatch::num_rows)) + .sum(); Ok(Self { schema, batches: partitions, - statistics: None, + statistics: Statistics { + num_rows: Some(num_rows), + total_byte_size: None, + }, }) } else { Err(DataFusionError::Plan( @@ -136,7 +143,7 @@ impl TableProvider for MemTable { )?)) } - fn statistics(&self) -> Option { + fn statistics(&self) -> Statistics { self.statistics.clone() } } @@ -167,6 +174,8 @@ mod tests { let provider = MemTable::try_new(schema, vec![vec![batch]])?; + assert_eq!(provider.statistics().num_rows, Some(3)); + // scan with projection let exec = provider.scan(&Some(vec![2, 1]), 1024)?; let mut it = exec.execute(0).await?; diff --git a/rust/datafusion/src/datasource/parquet.rs b/rust/datafusion/src/datasource/parquet.rs index 6d7ec028fb09..9ac9c7ba871e 100644 --- a/rust/datafusion/src/datasource/parquet.rs +++ b/rust/datafusion/src/datasource/parquet.rs @@ -33,7 +33,7 @@ use crate::physical_plan::ExecutionPlan; pub struct ParquetTable { path: String, schema: SchemaRef, - statistics: Option, + statistics: Statistics, } impl ParquetTable { @@ -44,7 +44,7 @@ impl ParquetTable { Ok(Self { path: path.to_string(), schema, - statistics: None, + statistics: Statistics::default(), }) } } @@ -73,7 +73,7 @@ impl TableProvider for ParquetTable { )?)) } - fn statistics(&self) -> Option { + fn statistics(&self) -> Statistics { self.statistics.clone() } } diff --git a/rust/datafusion/tests/dataframe.rs b/rust/datafusion/tests/dataframe.rs index 6bf6c2379adf..472de2ddf225 100644 --- a/rust/datafusion/tests/dataframe.rs +++ b/rust/datafusion/tests/dataframe.rs @@ -147,8 +147,8 @@ impl TableProvider for CustomTableProvider { })) } - fn statistics(&self) -> Option { - None + fn statistics(&self) -> Statistics { + Statistics::default() } }