From af77bc60e29a104b0a0d89f2c74369bdc9bd26a0 Mon Sep 17 00:00:00 2001 From: Jack Wright <56345+ayax79@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:07:13 -0700 Subject: Improved null handling when converting from nu -> dataframe. (#13855) # Description Fixes: #12726 and #13185 Previously converting columns that contained null caused polars to force a dtype of object even when using a schema. Now: 1. When using a schema, the type the schema defines for the column will always be used. 2. When a schema is not used, the previous type is used when a value is null. # User-Facing Changes - The type defined by the schema we be respected when passing in a null value `[a]; [null] | polars into-df -s {a: str}` will create a df with an str dtype column with one null value versus a column of type object. - *BREAKING CHANGE* If you define a schema, all columns must be in the schema. --- .../src/dataframe/command/aggregation/count.rs | 5 +- .../dataframe/values/nu_dataframe/conversion.rs | 312 ++++++++++++++++++--- 2 files changed, 272 insertions(+), 45 deletions(-) diff --git a/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs b/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs index 8bf816341..3b1b32952 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs @@ -32,12 +32,9 @@ impl PluginCommand for ExprCount { } fn examples(&self) -> Vec { - // to add an example with a result that contains a null we will need to be able to - // allow null values to be entered into the dataframe from nushell - // and retain the correct dtype. Right now null values cause the dtype to be object vec![Example { description: "Count the number of non-null values in a column", - example: r#"[[a]; ["foo"] ["bar"]] | polars into-df + example: r#"[[a]; ["foo"] ["bar"] [null]] | polars into-df | polars select (polars col a | polars count) | polars collect"#, result: Some( diff --git a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs index fe0a95612..2e988a08b 100644 --- a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs +++ b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs @@ -71,8 +71,11 @@ pub struct Column { } impl Column { - pub fn new(name: String, values: Vec) -> Self { - Self { name, values } + pub fn new(name: impl Into, values: Vec) -> Self { + Self { + name: name.into(), + values, + } } pub fn new_empty(name: String) -> Self { @@ -149,7 +152,7 @@ pub fn create_column( ) -> Result { let size = to_row - from_row; let values = series_to_values(series, Some(from_row), Some(size), span)?; - Ok(Column::new(series.name().into(), values)) + Ok(Column::new(series.name(), values)) } // Adds a separator to the vector of values using the column names from the @@ -194,9 +197,25 @@ pub fn insert_value( Entry::Occupied(entry) => entry.into_mut(), }; + // If we have a schema, use that for determining how things should be added to each column + if let Some(schema) = maybe_schema { + if let Some(field) = schema.schema.get_field(&key) { + col_val.column_type = Some(field.data_type().clone()); + col_val.values.push(value); + Ok(()) + } else { + Err(ShellError::GenericError { + error: format!("Schema does not contain column: {key}"), + msg: "".into(), + span: Some(value.span()), + help: None, + inner: vec![], + }) + } + } // Checking that the type for the value is the same // for the previous value in the column - if col_val.values.is_empty() { + else if col_val.values.is_empty() { if let Some(schema) = maybe_schema { if let Some(field) = schema.schema.get_field(&key) { col_val.column_type = Some(field.data_type().clone()); @@ -206,8 +225,8 @@ pub fn insert_value( if col_val.column_type.is_none() { col_val.column_type = Some(value_to_data_type(&value)); } - col_val.values.push(value); + Ok(()) } else { let prev_value = &col_val.values[col_val.values.len() - 1]; @@ -219,6 +238,7 @@ pub fn insert_value( | (Value::Date { .. }, Value::Date { .. }) | (Value::Filesize { .. }, Value::Filesize { .. }) | (Value::Duration { .. }, Value::Duration { .. }) => col_val.values.push(value), + (_, Value::Nothing { .. }) => col_val.values.push(value), (Value::List { .. }, _) => { col_val.column_type = Some(value_to_data_type(&value)); col_val.values.push(value); @@ -228,9 +248,8 @@ pub fn insert_value( col_val.values.push(value); } } + Ok(()) } - - Ok(()) } fn value_to_data_type(value: &Value) -> DataType { @@ -269,16 +288,18 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| match v { - Value::Float { val, .. } => Ok(*val as f32), - Value::Int { val, .. } => Ok(*val as f32), - x => Err(ShellError::GenericError { - error: "Error converting to f32".into(), - msg: "".into(), - span: None, - help: Some(format!("Unexpected type: {x:?}")), - inner: vec![], - }), + .map(|v| { + value_to_option(v, |v| match v { + Value::Float { val, .. } => Ok(*val as f32), + Value::Int { val, .. } => Ok(*val as f32), + x => Err(ShellError::GenericError { + error: "Error converting to f32".into(), + msg: "".into(), + span: None, + help: Some(format!("Unexpected type: {x:?}")), + inner: vec![], + }), + }) }) .collect(); Ok(Series::new(name, series_values?)) @@ -287,16 +308,18 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| match v { - Value::Float { val, .. } => Ok(*val), - Value::Int { val, .. } => Ok(*val as f64), - x => Err(ShellError::GenericError { - error: "Error converting to f64".into(), - msg: "".into(), - span: None, - help: Some(format!("Unexpected type: {x:?}")), - inner: vec![], - }), + .map(|v| { + value_to_option(v, |v| match v { + Value::Float { val, .. } => Ok(*val), + Value::Int { val, .. } => Ok(*val as f64), + x => Err(ShellError::GenericError { + error: "Error converting to f64".into(), + msg: "".into(), + span: None, + help: Some(format!("Unexpected type: {x:?}")), + inner: vec![], + }), + }) }) .collect(); Ok(Series::new(name, series_values?)) @@ -305,7 +328,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| v as u8)) + .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u8))) .collect(); Ok(Series::new(name, series_values?)) } @@ -313,7 +336,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| v as u16)) + .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u16))) .collect(); Ok(Series::new(name, series_values?)) } @@ -321,7 +344,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| v as u32)) + .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u32))) .collect(); Ok(Series::new(name, series_values?)) } @@ -329,7 +352,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| v as u64)) + .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u64))) .collect(); Ok(Series::new(name, series_values?)) } @@ -337,7 +360,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| v as i8)) + .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i8))) .collect(); Ok(Series::new(name, series_values?)) } @@ -345,7 +368,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| v as i16)) + .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i16))) .collect(); Ok(Series::new(name, series_values?)) } @@ -353,23 +376,32 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| v as i32)) + .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i32))) .collect(); Ok(Series::new(name, series_values?)) } DataType::Int64 => { - let series_values: Result, _> = - column.values.iter().map(|v| v.as_i64()).collect(); + let series_values: Result, _> = column + .values + .iter() + .map(|v| value_to_option(v, |v| v.as_i64())) + .collect(); Ok(Series::new(name, series_values?)) } DataType::Boolean => { - let series_values: Result, _> = - column.values.iter().map(|v| v.as_bool()).collect(); + let series_values: Result, _> = column + .values + .iter() + .map(|v| value_to_option(v, |v| v.as_bool())) + .collect(); Ok(Series::new(name, series_values?)) } DataType::String => { - let series_values: Result, _> = - column.values.iter().map(|v| v.coerce_string()).collect(); + let series_values: Result, _> = column + .values + .iter() + .map(|v| value_to_option(v, |v| v.coerce_string())) + .collect(); Ok(Series::new(name, series_values?)) } DataType::Object(_, _) => value_to_series(name, &column.values), @@ -377,7 +409,11 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result, _> = column .values .iter() - .map(|v| v.as_i64().map(|v| nanos_from_timeunit(v, *time_unit))) + .map(|v| { + value_to_option(v, |v| { + v.as_i64().map(|v| nanos_from_timeunit(v, *time_unit)) + }) + }) .collect(); Ok(Series::new(name, series_values?)) } @@ -1241,6 +1277,17 @@ fn time_from_midnight(nanos: i64, span: Span) -> Result { }) } +fn value_to_option(value: &Value, func: F) -> Result, ShellError> +where + F: FnOnce(&Value) -> Result, +{ + if value.is_nothing() { + Ok(None) + } else { + func(value).map(|v| Some(v)) + } +} + #[cfg(test)] mod tests { use indexmap::indexmap; @@ -1461,4 +1508,187 @@ mod tests { Ok(()) } + + #[test] + fn test_typed_column_to_series_f32() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new( + "foo", + vec![ + Value::test_float(1.1), + Value::test_int(2), + Value::test_nothing(), + ], + ), + column_type: Some(DataType::Float32), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1.1f32), Some(2.0), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_f64() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new( + "foo", + vec![ + Value::test_float(1.1), + Value::test_int(2), + Value::test_nothing(), + ], + ), + column_type: Some(DataType::Float64), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1.1f64), Some(2.0), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_u8() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::UInt8), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1u8), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_u16() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::UInt16), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1u16), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_u32() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::UInt32), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1u32), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_u64() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::UInt64), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1u64), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_i8() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::Int8), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1i8), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_i16() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::Int16), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1i16), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_i32() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::Int32), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1i32), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_i64() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]), + column_type: Some(DataType::Int64), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(1i64), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_bool() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new( + "foo", + vec![ + Value::test_bool(true), + Value::test_bool(false), + Value::test_nothing(), + ], + ), + column_type: Some(DataType::Boolean), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!(result, Series::new("name", [Some(true), Some(false), None])); + Ok(()) + } + + #[test] + fn test_typed_column_to_series_string() -> Result<(), Box> { + let column = TypedColumn { + column: Column::new( + "foo", + vec![Value::test_string("barbaz"), Value::test_nothing()], + ), + column_type: Some(DataType::String), + }; + + let result = typed_column_to_series("foo", column)?; + + assert_eq!( + result, + Series::new("name", [Some("barbaz".to_string()), None]) + ); + Ok(()) + } } -- cgit v1.2.3-70-g09d2