dataFrameOf
// Built from column-name-to-values pairs: 2 columns ("name", "age") and 3 rows.
val df = dataFrameOf(
    "name" to listOf("Alice", "Bob", "Charlie"),
    "age" to listOf(15, 20, 100),
)
Create a DataFrame with nested columns in place:
// "name" is itself built from two nested columns (firstName, lastName);
// together with "age" the result has 2 top-level columns and 3 rows.
val df = dataFrameOf(
    "name" to columnOf(
        "firstName" to columnOf("Alice", "Bob", "Charlie"),
        "lastName" to columnOf("Cooper", "Dylan", "Daniels"),
    ),
    "age" to columnOf(15, 20, 100),
)
// Same pair-based builder, with columnOf(...) supplying the values of each column: 2 columns.
val df = dataFrameOf(
    "name" to columnOf("Alice", "Bob", "Charlie"),
    "age" to columnOf(15, 20, 22),
)
Returns a DataFrame with given column names and values.
// Column names come first, then the values laid out one row per line: 2 columns and 3 rows.
val df = dataFrameOf("name", "age")(
    "Alice", 15,
    "Bob", 20,
    "Charlie", 100,
)
toDataFrame
DataFrame from Map<String, List<*>>:
// Keys are the column names, the lists are the column values.
val map = mapOf(
    "name" to listOf("Alice", "Bob", "Charlie"),
    "age" to listOf(15, 20, 22),
)

// Result: a DataFrame with 2 columns
map.toDataFrame()
DataFrame from random data:
Use IntRange to generate rows filled with random values:
val categories = listOf("Electronics", "Books", "Clothing")

// The range supplies 7 source values (7 rows); each `from` line declares one column,
// so the result has 4 columns.
(0 until 7).toDataFrame {
    "productId" from { "P${1000 + it}" }
    "category" from { categories.random() }
    "price" from { Random.nextDouble(10.0, 500.0) }
    "inStock" from { Random.nextInt(0..100) }
}
Generate DataFrame with nested ColumnGroup and FrameColumn:
val categories = listOf("Electronics", "Books", "Clothing")

// 7 rows and 5 top-level columns: three plain columns, one column group and one frame column.
(0 until 7).toDataFrame {
    "productId" from { "P${1000 + it}" }
    "category" from { categories.random() }
    "price" from { Random.nextDouble(10.0, 500.0) }

    // Column group: two columns nested under "manufacturer"
    "manufacturer" {
        "country" from { listOf("USA", "China", "Germany", "Japan").random() }
        "yearEstablished" from { Random.nextInt(1950..2020) }
    }

    // Frame column: every product row gets its own DataFrame of 0..7 reviews
    "reviews" from {
        val reviewCount = Random.nextInt(0..7)
        (0 until reviewCount).toDataFrame {
            val ratings: DataColumn<Int> = expr { Random.nextInt(1..5) }

            // Pick a random comment from the list that matches the rating.
            val comments = ratings.map { rating ->
                val candidates = when (rating) {
                    5 -> listOf("Amazing quality!", "Best purchase ever!", "Highly recommend!", "Absolutely perfect!")
                    4 -> listOf("Great product!", "Very satisfied", "Good value for money", "Would buy again")
                    3 -> listOf("It's okay", "Does the job", "Average quality", "Neither good nor bad")
                    2 -> listOf("Could be better", "Disappointed", "Not what I expected", "Poor quality")
                    else -> listOf("Terrible!", "Not worth the price", "Complete waste of money", "Do not buy!")
                }
                candidates.random()
            }

            "author" from { "User${Random.nextInt(1000..10000)}" }
            ratings into "rating"
            comments into "comment"
        }
    }
}
Use from in combination with loops to generate DataFrame:
// Multiplication table: the loop declares one column per factor x in 1..10,
// and each cell is x multiplied by the source value of the row.
(1..10).toDataFrame {
    for (x in 1..10) {
        "$x" from { x * it }
    }
}
The return type of these overloads is a typed DataFrame. Its data schema defines the columns that can be used right after the conversion for additional computations.
// Source values: a plain list of strings.
val names = listOf("Alice", "Bob", "Charlie")
// TODO fix with plugin???
// NOTE(review): the explicit cast presumably stands in for schema inference that the
// compiler plugin is expected to provide — confirm before removing it.
val df = names.toDataFrame() as DataFrame<ValueProperty<String>>
// Computes a "length" column from `value`, which is resolved inside the lambda for each row.
df.add("length") { value.length }
This is an easy way to create a DataFrame when you have a list of Files, URLs, or a structure you want to extract data from.
In a notebook, it can be convenient to start from the column of these values to see the number of rows, their toString in a table and then iteratively add columns with the parts of the data you're interested in. It could be a File's content, a specific section of an HTML document, some metadata, etc.
// The objects to start from; they are passed to toDataFrame with the column name "data".
val files = listOf(
    File("data.csv"),
    File("data1.csv"),
)
val df = files.toDataFrame(columnName = "data")
DataFrame from List<List<T>>:
This is useful for parsing text files. For example, the .srt subtitle format can be parsed like this:
// Minimal .srt sample. Each cue is three lines (index, time range, text) and cues are
// separated by one blank line, so every cue except possibly the last spans 4 lines.
val lines = """
    1
    00:00:05,000 --> 00:00:07,500
    This is the first subtitle.

    2
    00:00:08,000 --> 00:00:10,250
    This is the second subtitle.
""".trimIndent().lines()

// Split into 4-line cues and drop the blank separator by keeping only the first 3 lines,
// so every row has exactly the three values named in the header.
lines.chunked(4) { it.take(3) }.toDataFrame(header = listOf("n", "timestamp", "text"))
/** Simple value holder used as the row type of the example. */
data class Person(
    val name: String,
    val age: Int,
)

val persons = listOf(
    Person("Alice", 15),
    Person("Bob", 20),
    Person("Charlie", 22),
)

// Convert the list of objects into a DataFrame.
val df = persons.toDataFrame()
Scans object properties using reflection and creates a ValueColumn for every property. The scope of properties for scanning is defined at compile-time by the formal types of the objects in the Iterable, so the properties of implementation classes will not be scanned.
Specify the maxDepth parameter to perform deep object graph traversal and convert nested objects into ColumnGroups and FrameColumns:
/** First and last name of a student. */
data class Name(val firstName: String, val lastName: String)

/** A single score in one subject. */
data class Score(val subject: String, val value: Int)

/** A student with a nested [Name] object and a list of [Score] objects. */
data class Student(val name: Name, val age: Int, val scores: List<Score>)

val students = listOf(
    Student(
        name = Name("Alice", "Cooper"),
        age = 15,
        scores = listOf(Score("math", 4), Score("biology", 3)),
    ),
    Student(
        name = Name("Bob", "Marley"),
        age = 20,
        scores = listOf(Score("music", 5)),
    ),
)

// maxDepth = 1 requests traversal one level into the object graph (here: `name` and `scores`).
val df = students.toDataFrame(maxDepth = 1)
For detailed control over object graph transformations, use the configuration DSL. It allows you to exclude particular properties or classes from the object graph traversal, compute additional columns, and configure column grouping.
val df = students.toDataFrame {
    // Extra computed column
    "year of birth" from { student -> 2021 - student.age }

    // Scan the properties of the objects
    properties(maxDepth = 1) {
        exclude(Score::subject) // `subject` is left out of the object graph traversal
        preserve<Name>() // `Name` objects are kept as-is instead of being converted into a DataFrame
    }

    // Extra column group with two computed columns
    "summary" {
        "max score" from { student -> student.scores.maxOf { score -> score.value } }
        "min score" from { student -> student.scores.minOf { score -> score.value } }
    }
}