Dataframe 0.13 Help

Overview

What is a Data Frame

A data frame is an abstraction for working with structured data. Essentially, it is a two-dimensional table with labeled columns of potentially different types. You can think of it as a spreadsheet, an SQL table, or a dictionary of series objects.
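
For example, a small data frame can be constructed directly in code from column names and values. This is a minimal sketch with made-up data; the columns and rows are illustrative only:

import org.jetbrains.kotlinx.dataframe.api.dataFrameOf

// two labeled columns of different types: "name" (String) and "age" (Int)
val df = dataFrameOf("name", "age")(
    "Alice", 15,
    "Bob", 20,
    "Charlie", 100,
)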

The usefulness of this abstraction comes not from the table itself but from the set of operations defined on it. The Kotlin DataFrame library is an idiomatic Kotlin DSL that defines such operations. Working with data frames is often called data wrangling: the process of transforming and mapping data from a "raw" form into a format that is more suitable for analytics and visualization. The goal of data wrangling is to ensure the resulting data is correct and useful.

Main Features and Concepts

  • Hierarchical — the Kotlin DataFrame library can read and present data from different sources, including not only plain CSV but also JSON and SQL databases. That’s why it is designed to be hierarchical and allows nesting of columns and cells.

  • Interoperable — the hierarchical data layout also makes it possible to convert any object structure in application memory to a data frame and vice versa (as shown in the sketch below).

  • Safe — the Kotlin DataFrame library provides a mechanism for on-the-fly generation of extension properties that correspond to the columns of a data frame. In interactive notebooks like Jupyter or Datalore, the generation runs after each cell execution. In IntelliJ IDEA, there is a Gradle plugin that generates properties based on CSV or JSON files. We are also working on a compiler plugin that infers and transforms the DataFrame schema as you type. The generated properties ensure that you never misspell a column name or mix up its type, and nullability is preserved as well.

  • Generic — columns can store objects of any type, not only numbers or strings.

  • Polymorphic — if all columns of a DataFrame are present in some other DataFrame, then the first one can act as a supertype for the latter. Thus, one can define a function on an interface with a certain set of columns and then safely execute it on any DataFrame that contains those columns (see the sketch after this list).

  • Immutable — all operations on a DataFrame produce a new instance, while the underlying data is reused wherever possible.
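
The sketch below illustrates the interoperability and polymorphism points above. The Passenger class, the Person schema, and the countAdults function are made-up names for illustration, and the age property used inside countAdults assumes the extension properties generated for @DataSchema types by the DataFrame code-generation plugin (or a notebook environment):

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.count
import org.jetbrains.kotlinx.dataframe.api.toDataFrame

// interoperable: any collection of objects can be converted to a data frame
data class Passenger(val name: String, val age: Int, val survived: Boolean)

// polymorphic: a schema interface describes a set of columns
@DataSchema
interface Person {
    val name: String
    val age: Int
}

// works on any DataFrame whose columns include `name` and `age`;
// `age` here is the extension property generated for the Person schema
fun DataFrame<Person>.countAdults(): Int = count { age >= 18 }

fun main() {
    val passengers = listOf(
        Passenger("Allen", 29, true),
        Passenger("Bonnell", 58, true),
    ).toDataFrame()

    // DataFrame<Passenger> contains all columns of Person, so it can be used as one
    println(passengers.cast<Person>().countAdults())
}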

Syntax

Basics:

val df = DataFrame.read("titanic.csv", delimiter = ';')
// filter rows
df.filter { survived && home.endsWith("NY") && age in 10..20 }

// add column
df.add("birthYear") { 1912 - age }

// sort rows
df.sortByDesc { age }

// aggregate data
df.groupBy { pclass }.aggregate {
    maxBy { age }.name into "oldest person"
    count { survived } into "survived"
}

The titanic.csv file can be found in the DataFrame GitHub repository.

Create:

// create columns
val fromTo by columnOf("LoNDon_paris", "MAdrid_miLAN", "londON_StockhOlm", "Budapest_PaRis", "Brussels_londOn")
val flightNumber by columnOf(10045.0, Double.NaN, 10065.0, Double.NaN, 10085.0)
val recentDelays by columnOf("23,47", null, "24, 43, 87", "13", "67, 32")
val airline by columnOf("KLM(!)", "{Air France} (12)", "(British Airways. )", "12. Air France", "'Swiss Air'")

// create dataframe
val df = dataFrameOf(fromTo, flightNumber, recentDelays, airline)

Clean:

// typed accessors for columns
// that will appear during
// dataframe transformation
val origin by column<String>()
val destination by column<String>()

val dfClean = df
    // fill missing flight numbers
    .fillNA { flightNumber }.with { prev()!!.flightNumber + 10 }
    // convert flight numbers to int
    .convert { flightNumber }.toInt()
    // clean 'Airline' column
    .update { airline }.with { "([a-zA-Z\\s]+)".toRegex().find(it)?.value ?: "" }
    // split 'From_To' column into 'From' and 'To'
    .split { fromTo }.by("_").into(origin, destination)
    // clean 'From' and 'To' columns
    .update { origin and destination }.with { it.lowercase().replaceFirstChar(Char::uppercase) }
    // split lists of delays in 'RecentDelays' into separate columns
    // 'delay1', 'delay2'... and nest them inside original column `RecentDelays`
    .split { recentDelays }.inward { "delay$it" }
    // convert string values in `delay1`, `delay2` into ints
    .parse { recentDelays }

Aggregate:

// group by flight origin
dfClean.groupBy { origin }.aggregate {
    // we are in the context of a single data group

    // number of flights from origin
    count() into "count"
    // list of flight numbers
    flightNumber into "flight numbers"
    // counts of flights per airline
    airline.valueCounts() into "airlines"
    // max delay across all delays in `delay1` and `delay2`
    recentDelays.maxOrNull { delay1 and delay2 } into "major delay"
    // separate lists of recent delays for `delay1`, `delay2` and `delay3`
    recentDelays.implode(dropNulls = true) into "recent delays"
    // total delay per city of destination
    pivot { destination }.sum { recentDelays.intCols() } into "total delays to"
}

Contribute and give feedback

If you find a bug, or have an idea for a new feature, file an issue in our DataFrame GitHub repository.

Additionally, we welcome contributions. To get started, choose an issue and let us know that you're working on it. When you're ready, create a pull request.

You can also contact us in the #datascience channel of Kotlin Slack.

For more information on how to contribute, see our Contributing guidelines.

Good luck!

Last modified: 29 March 2024