Column selectors
DataFrame provides a DSL for selecting an arbitrary set of columns.
Column selectors are used in many operations:
df.select { age and name }
df.fillNaNs { colsAtAnyDepth().colsOf<Double>() }.withZero()
df.remove { cols { it.hasNulls() } }
df.group { cols { it.data != name } }.into { "nameless" }
df.update { city }.notNull { it.lowercase() }
df.gather { colsOf<Number>() }.into("key", "value")
df.move { name.firstName and name.lastName }.after { city }
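Select columns by name (the examples are repeated for three API styles in turn: extension properties, column accessors, and string names):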
// by column name
df.select { it.name }
df.select { name }
// by column path
df.select { name.firstName }
// with a new name
df.select { name named "Full Name" }
// converted
df.select { name.firstName.map { it.lowercase() } }
// column arithmetic
df.select { 2021 - age }
// two columns
df.select { name and age }
// range of columns
df.select { name..age }
// all columns of ColumnGroup
df.select { name.allCols() }
// traversal of columns at any depth from here excluding ColumnGroups
df.select { name.colsAtAnyDepth { !it.isColumnGroup() } }
// by column name
val name by columnGroup()
df.select { it[name] }
df.select { name }
// by column path
val firstName by name.column<String>()
df.select { firstName }
// with a new name
df.select { name named "Full Name" }
// converted
df.select { firstName.map { it.lowercase() } }
// column arithmetic
val age by column<Int>()
df.select { 2021 - age }
// two columns
df.select { name and age }
// range of columns
df.select { name..age }
// all columns of ColumnGroup
df.select { name.allCols() }
// traversal of columns at any depth from here excluding ColumnGroups
df.select { name.colsAtAnyDepth { !it.isColumnGroup() } }
// by column name
df.select { it["name"] }
// by column path
df.select { it["name"]["firstName"] }
df.select { "name"["firstName"] }
// with a new name
df.select { "name" named "Full Name" }
// converted
df.select { "name"["firstName"]<String>().map { it.uppercase() } }
// column arithmetic
df.select { 2021 - "age"<Int>() }
// two columns
df.select { "name" and "age" }
// by range of names
df.select { "name".."age" }
// all columns of ColumnGroup
df.select { "name".allCols() }
// traversal of columns at any depth from here excluding ColumnGroups
df.select { "name".colsAtAnyDepth { !it.isColumnGroup() } }
Select columns by column index:
// by index
df.select { col(2) }
// by several indices
df.select { cols(0, 1, 3) }
// by range of indices
df.select { cols(1..4) }
Other column selectors:
// by condition
df.select { cols { it.name().startsWith("year") } }
df.select { nameStartsWith("year") }
// by type
df.select { colsOf<String>() }
// by type with condition
df.select { colsOf<String?> { it.countDistinct() > 5 } }
// all top-level columns
df.select { all() }
// first/last n columns
df.select { take(2) }
df.select { takeLast(2) }
// all except first/last n columns
df.select { drop(2) }
df.select { dropLast(2) }
// find the first column satisfying the condition
df.select { first { it.name.startsWith("year") } }
// find the last column inside a column group satisfying the condition
df.select {
colGroup("name").lastCol { it.name().endsWith("Name") }
}
// find the single column inside a column group satisfying the condition
df.select {
Person::name.singleCol { it.name().startsWith("first") }
}
// traversal of columns at any depth from here excluding ColumnGroups
df.select { colsAtAnyDepth { !it.isColumnGroup() } }
// traversal of columns at any depth from here including ColumnGroups
df.select { colsAtAnyDepth() }
// traversal of columns at any depth with condition
df.select { colsAtAnyDepth { it.name().contains(":") } }
// traversal of columns at any depth to find columns of given type
df.select { colsAtAnyDepth().colsOf<String>() }
// all columns except given column set
df.select { allExcept { colsOf<String>() } }
// union of column sets
df.select { take(2) and col(3) }
Modify the set of selected columns:
// first/last n value- and frame columns in column set
df.select { colsAtAnyDepth { !it.isColumnGroup() }.take(3) }
df.select { colsAtAnyDepth { !it.isColumnGroup() }.takeLast(3) }
// all except first/last n value- and frame columns in column set
df.select { colsAtAnyDepth { !it.isColumnGroup() }.drop(3) }
df.select { colsAtAnyDepth { !it.isColumnGroup() }.dropLast(3) }
// filter column set by condition
df.select { colsAtAnyDepth { !it.isColumnGroup() }.filter { it.name().startsWith("year") } }
// exclude columns from column set
df.select { colsAtAnyDepth { !it.isColumnGroup() }.except { age } }
// keep only unique columns
df.select { (colsOf<Int>() and age).distinct() }
Last modified: 29 March 2024