summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJack Wright <56345+ayax79@users.noreply.github.com>2024-09-16 16:07:13 -0700
committerGitHub <noreply@github.com>2024-09-16 18:07:13 -0500
commitaf77bc60e29a104b0a0d89f2c74369bdc9bd26a0 (patch)
treeb7f840097189e85f195d4a5ea0b068cc40d7fa02
parent9ca0fb772ddc9ab362efa9dd15d98f12c7130b8f (diff)
Improved null handling when converting from nu -> dataframe. (#13855)
# Description Fixes: #12726 and #13185 Previously converting columns that contained null caused polars to force a dtype of object even when using a schema. Now: 1. When using a schema, the type the schema defines for the column will always be used. 2. When a schema is not used, the previous type is used when a value is null. # User-Facing Changes - The type defined by the schema we be respected when passing in a null value `[a]; [null] | polars into-df -s {a: str}` will create a df with an str dtype column with one null value versus a column of type object. - *BREAKING CHANGE* If you define a schema, all columns must be in the schema.
-rw-r--r--crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs5
-rw-r--r--crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs312
2 files changed, 272 insertions, 45 deletions
diff --git a/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs b/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs
index 8bf816341..3b1b32952 100644
--- a/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs
+++ b/crates/nu_plugin_polars/src/dataframe/command/aggregation/count.rs
@@ -32,12 +32,9 @@ impl PluginCommand for ExprCount {
}
fn examples(&self) -> Vec<Example> {
- // to add an example with a result that contains a null we will need to be able to
- // allow null values to be entered into the dataframe from nushell
- // and retain the correct dtype. Right now null values cause the dtype to be object
vec![Example {
description: "Count the number of non-null values in a column",
- example: r#"[[a]; ["foo"] ["bar"]] | polars into-df
+ example: r#"[[a]; ["foo"] ["bar"] [null]] | polars into-df
| polars select (polars col a | polars count)
| polars collect"#,
result: Some(
diff --git a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs
index fe0a95612..2e988a08b 100644
--- a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs
+++ b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs
@@ -71,8 +71,11 @@ pub struct Column {
}
impl Column {
- pub fn new(name: String, values: Vec<Value>) -> Self {
- Self { name, values }
+ pub fn new(name: impl Into<String>, values: Vec<Value>) -> Self {
+ Self {
+ name: name.into(),
+ values,
+ }
}
pub fn new_empty(name: String) -> Self {
@@ -149,7 +152,7 @@ pub fn create_column(
) -> Result<Column, ShellError> {
let size = to_row - from_row;
let values = series_to_values(series, Some(from_row), Some(size), span)?;
- Ok(Column::new(series.name().into(), values))
+ Ok(Column::new(series.name(), values))
}
// Adds a separator to the vector of values using the column names from the
@@ -194,9 +197,25 @@ pub fn insert_value(
Entry::Occupied(entry) => entry.into_mut(),
};
+ // If we have a schema, use that for determining how things should be added to each column
+ if let Some(schema) = maybe_schema {
+ if let Some(field) = schema.schema.get_field(&key) {
+ col_val.column_type = Some(field.data_type().clone());
+ col_val.values.push(value);
+ Ok(())
+ } else {
+ Err(ShellError::GenericError {
+ error: format!("Schema does not contain column: {key}"),
+ msg: "".into(),
+ span: Some(value.span()),
+ help: None,
+ inner: vec![],
+ })
+ }
+ }
// Checking that the type for the value is the same
// for the previous value in the column
- if col_val.values.is_empty() {
+ else if col_val.values.is_empty() {
if let Some(schema) = maybe_schema {
if let Some(field) = schema.schema.get_field(&key) {
col_val.column_type = Some(field.data_type().clone());
@@ -206,8 +225,8 @@ pub fn insert_value(
if col_val.column_type.is_none() {
col_val.column_type = Some(value_to_data_type(&value));
}
-
col_val.values.push(value);
+ Ok(())
} else {
let prev_value = &col_val.values[col_val.values.len() - 1];
@@ -219,6 +238,7 @@ pub fn insert_value(
| (Value::Date { .. }, Value::Date { .. })
| (Value::Filesize { .. }, Value::Filesize { .. })
| (Value::Duration { .. }, Value::Duration { .. }) => col_val.values.push(value),
+ (_, Value::Nothing { .. }) => col_val.values.push(value),
(Value::List { .. }, _) => {
col_val.column_type = Some(value_to_data_type(&value));
col_val.values.push(value);
@@ -228,9 +248,8 @@ pub fn insert_value(
col_val.values.push(value);
}
}
+ Ok(())
}
-
- Ok(())
}
fn value_to_data_type(value: &Value) -> DataType {
@@ -269,16 +288,18 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| match v {
- Value::Float { val, .. } => Ok(*val as f32),
- Value::Int { val, .. } => Ok(*val as f32),
- x => Err(ShellError::GenericError {
- error: "Error converting to f32".into(),
- msg: "".into(),
- span: None,
- help: Some(format!("Unexpected type: {x:?}")),
- inner: vec![],
- }),
+ .map(|v| {
+ value_to_option(v, |v| match v {
+ Value::Float { val, .. } => Ok(*val as f32),
+ Value::Int { val, .. } => Ok(*val as f32),
+ x => Err(ShellError::GenericError {
+ error: "Error converting to f32".into(),
+ msg: "".into(),
+ span: None,
+ help: Some(format!("Unexpected type: {x:?}")),
+ inner: vec![],
+ }),
+ })
})
.collect();
Ok(Series::new(name, series_values?))
@@ -287,16 +308,18 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| match v {
- Value::Float { val, .. } => Ok(*val),
- Value::Int { val, .. } => Ok(*val as f64),
- x => Err(ShellError::GenericError {
- error: "Error converting to f64".into(),
- msg: "".into(),
- span: None,
- help: Some(format!("Unexpected type: {x:?}")),
- inner: vec![],
- }),
+ .map(|v| {
+ value_to_option(v, |v| match v {
+ Value::Float { val, .. } => Ok(*val),
+ Value::Int { val, .. } => Ok(*val as f64),
+ x => Err(ShellError::GenericError {
+ error: "Error converting to f64".into(),
+ msg: "".into(),
+ span: None,
+ help: Some(format!("Unexpected type: {x:?}")),
+ inner: vec![],
+ }),
+ })
})
.collect();
Ok(Series::new(name, series_values?))
@@ -305,7 +328,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| v as u8))
+ .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u8)))
.collect();
Ok(Series::new(name, series_values?))
}
@@ -313,7 +336,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| v as u16))
+ .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u16)))
.collect();
Ok(Series::new(name, series_values?))
}
@@ -321,7 +344,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| v as u32))
+ .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u32)))
.collect();
Ok(Series::new(name, series_values?))
}
@@ -329,7 +352,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| v as u64))
+ .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u64)))
.collect();
Ok(Series::new(name, series_values?))
}
@@ -337,7 +360,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| v as i8))
+ .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i8)))
.collect();
Ok(Series::new(name, series_values?))
}
@@ -345,7 +368,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| v as i16))
+ .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i16)))
.collect();
Ok(Series::new(name, series_values?))
}
@@ -353,23 +376,32 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| v as i32))
+ .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i32)))
.collect();
Ok(Series::new(name, series_values?))
}
DataType::Int64 => {
- let series_values: Result<Vec<_>, _> =
- column.values.iter().map(|v| v.as_i64()).collect();
+ let series_values: Result<Vec<_>, _> = column
+ .values
+ .iter()
+ .map(|v| value_to_option(v, |v| v.as_i64()))
+ .collect();
Ok(Series::new(name, series_values?))
}
DataType::Boolean => {
- let series_values: Result<Vec<_>, _> =
- column.values.iter().map(|v| v.as_bool()).collect();
+ let series_values: Result<Vec<_>, _> = column
+ .values
+ .iter()
+ .map(|v| value_to_option(v, |v| v.as_bool()))
+ .collect();
Ok(Series::new(name, series_values?))
}
DataType::String => {
- let series_values: Result<Vec<_>, _> =
- column.values.iter().map(|v| v.coerce_string()).collect();
+ let series_values: Result<Vec<_>, _> = column
+ .values
+ .iter()
+ .map(|v| value_to_option(v, |v| v.coerce_string()))
+ .collect();
Ok(Series::new(name, series_values?))
}
DataType::Object(_, _) => value_to_series(name, &column.values),
@@ -377,7 +409,11 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column
.values
.iter()
- .map(|v| v.as_i64().map(|v| nanos_from_timeunit(v, *time_unit)))
+ .map(|v| {
+ value_to_option(v, |v| {
+ v.as_i64().map(|v| nanos_from_timeunit(v, *time_unit))
+ })
+ })
.collect();
Ok(Series::new(name, series_values?))
}
@@ -1241,6 +1277,17 @@ fn time_from_midnight(nanos: i64, span: Span) -> Result<Value, ShellError> {
})
}
+fn value_to_option<T, F>(value: &Value, func: F) -> Result<Option<T>, ShellError>
+where
+ F: FnOnce(&Value) -> Result<T, ShellError>,
+{
+ if value.is_nothing() {
+ Ok(None)
+ } else {
+ func(value).map(|v| Some(v))
+ }
+}
+
#[cfg(test)]
mod tests {
use indexmap::indexmap;
@@ -1461,4 +1508,187 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_typed_column_to_series_f32() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new(
+ "foo",
+ vec![
+ Value::test_float(1.1),
+ Value::test_int(2),
+ Value::test_nothing(),
+ ],
+ ),
+ column_type: Some(DataType::Float32),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1.1f32), Some(2.0), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_f64() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new(
+ "foo",
+ vec![
+ Value::test_float(1.1),
+ Value::test_int(2),
+ Value::test_nothing(),
+ ],
+ ),
+ column_type: Some(DataType::Float64),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1.1f64), Some(2.0), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_u8() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::UInt8),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1u8), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_u16() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::UInt16),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1u16), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_u32() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::UInt32),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1u32), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_u64() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::UInt64),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1u64), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_i8() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::Int8),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1i8), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_i16() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::Int16),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1i16), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_i32() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::Int32),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1i32), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_i64() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new("foo", vec![Value::test_int(1), Value::test_nothing()]),
+ column_type: Some(DataType::Int64),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(1i64), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_bool() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new(
+ "foo",
+ vec![
+ Value::test_bool(true),
+ Value::test_bool(false),
+ Value::test_nothing(),
+ ],
+ ),
+ column_type: Some(DataType::Boolean),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(result, Series::new("name", [Some(true), Some(false), None]));
+ Ok(())
+ }
+
+ #[test]
+ fn test_typed_column_to_series_string() -> Result<(), Box<dyn std::error::Error>> {
+ let column = TypedColumn {
+ column: Column::new(
+ "foo",
+ vec![Value::test_string("barbaz"), Value::test_nothing()],
+ ),
+ column_type: Some(DataType::String),
+ };
+
+ let result = typed_column_to_series("foo", column)?;
+
+ assert_eq!(
+ result,
+ Series::new("name", [Some("barbaz".to_string()), None])
+ );
+ Ok(())
+ }
}