In the following you will find a simple data mining demo that shows modelling and forecasting in the connection of R and Exasol. If you have not performed the installation and configuration of the necessary components yet, you can find an instruction here.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
# Laden der benötigten Pakete
library(RODBC)
library(exasol)
library(rredis)
library(magrittr)
library(stringi)
library(rpart)
library(partykit)
# Verbindung zu Exasol und Redis herstellen
con <– odbcConnect(“exasol_vm”)
redisConnect(“172.20.248.13”)
# Erstellen eines Zufallszahlen-Vektors für die Stichprobenziehung
rnd <– rnorm(nrow(iris))
# Anfügen der Gruppenvariable (Training/Validierung)
iris$groups <– factor(NA, levels = c(“Train”, “Valid”))
# Nach Spezies geschichtete Zufallsziehung: 70% Training, 30% Validierung
for(i in unique(iris$Species)) {
logVec <– iris$Species == i
iris$groups[logVec] <– ifelse(test = rnd[logVec] > quantile(rnd[logVec],
probs = 0.3),
yes = “Train”,
no = “Valid”)
}
# Überprüfung der Stichprobenziehung
table(iris$groups, iris$Species)
# Workspace aufräumen
rm(rnd, logVec, i)
|
The iris data are now transferred to the Exasol. At first use, a database schema and an empty table are created. Afterwards, the iris data are written into the table. Once the data are uploaded they can be used time and again.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# Datenbankschema mit dem Namen <em>my_schema</em> erstellen
odbcQuery(con, “create schema my_schema”)
# Erstellen der leeren Tabelle unter dem Namen <em>irisdb</em>
odbcQuery(con, “create or replace table my_schema.irisdb(
SepalLength DOUBLE,
SepalWidth DOUBLE,
PetalLength DOUBLE,
PetalWidth DOUBLE,
Species CHAR(20),
Groups CHAR(20));”)
# Hochladen der <em>iris</em> Daten in die Exasol
exa.writeData(con, data = iris, tableName = “my_schema.irisdb”)
|
In the first step of the analysis a decision tree is created locally. Based on this tree a forecast is done.
1
2
3
4
5
6
7
8
9
10
11
12
13
|
# Erstellen eines Trainings- und eines Validierungsdatensatzes
train <– subset(iris, subset = groups == “Train”, select = –groups)
valid <– subset(iris, subset = groups == “Valid”, select = –groups)
# Erstellen des Entscheidungsbaums mit den Trainingsdaten
localTree <– rpart(Species ~ ., data = train)
# Visualisierung des Baums
plot(as.party(localTree))
# Prognose Validierungsdaten mit Hilfe des Baums
pred <– predict(localTree, type = “class”, newdata = valid) # Prognose überprüfen table(pred, valid$Species, dnn = c(“Vorhersage”, “Tatsächlich”)) %>%
addmargins()
|
The same procedure as above now in the Exasol. The exa.script function creates an R script on the Exasol server. The function call runs the script on the Exasol clusters. This means that the analysis no longer takes place on the local R but on the R instances in the Exasol cluster. Packages which are used on the R instances on the Exasol need to be installed there. Have a look at this blog entry for more information.
The model built in the Exasol cluster will be stored in Redis, a key-value database. With Redis you can distribute models, functions and other R objects in the cluster and load them from the cluster.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
exa_rf <– exa.createScript(
con,
“my_schmea.exa_rf”, # Unter diesem Namen ist das R-Script über SQL verfügbar
function(data) {
# Laden der benötigten Pakete. Diese müssen ggf. in der Exasol installiert sein
require(rpart)
require(stringi)
require(rredis)
# Verbindung mit Redis
redisConnect(“172.20.248.13”, port = 6379)
# Laden aller Daten aus der Exasol Tabelle
# wird im Funktionsaufruf das <em>groupBy</em> oder <em>where</em> Argument verwendet,
# wird nur der entsprechende Teil der Daten geladen.
data$next_row(NA)
# Wandeln des <em>data</em> Objekts in einen <em>data.frame</em>
df <– data.frame(v1 = data$SepalLength,
v2 = data$SepalWidth,
v3 = data$PetalLength,
v4 = data$PetalWidth,
species = data$Species)
# Aufbereiten des data.frames
df$species <– stri_replace_all_fixed(df$species, ” “, “”)
df$species <– as.factor(df$species)
# Erstellen des Baums
rf <– rpart(species ~ ., data = df)
# Speichern des Baums in Redis
redisSet(“exa_rf”, rf)
# Rückgabe der Zeilenanzahl (zur Kontrolle)
data$emit(nrow(df))
},
inArgs = c(“SepalLength DOUBLE”,
“SepalWidth DOUBLE”,
“PetalLength DOUBLE”,
“PetalWidth DOUBLE”,
“Species CHAR(20)”),
outArgs = “Feedback INT”)
# Aufrufen der oben gebildeten Funktion. Das where Argument legt fest, dass
# das Modell auf den Trainingsdaten gebildet wird.
exa_rf(“SepalLength”, “SepalWidth”, “PetalLength”, “PetalWidth”, “Species”,
table = “my_schema.irisdb”,
where = “groups = ‘Train'”)
|
The tree model can be used for forecasting in a separate step on the Exasol. The first part is mostly identical with the above function.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
exa_predict_rf <– exa.createScript(
con,
“my_schema.exa_pred”,
function(data) {
require(rpart)
require(rredis)
redisConnect(“172.20.248.13”, port = 6379)
data$next_row(NA)
df <– data.frame(v1 = data$SepalLength,
v2 = data$SepalWidth,
v3 = data$PetalLength,
v4 = data$PetalWidth,
species = data$Species)
# Laden des Baum Modells aus Redis
rf <– redisGet(“exa_rf”)
# Erstellen der Prognose
pred <– predict(rf, newdata = df, type = “class”)
# Rückgabe der Prognose sowie der echten Klassenzugehörigkeit
data$emit(pred, df$species)
},
inArgs = c( “SepalLength DOUBLE”,
“SepalWidth DOUBLE”,
“PetalLength DOUBLE”,
“PetalWidth DOUBLE”,
“Species CHAR(20)”),
outArgs = c(“Prognose CHAR(20)”,
“Realwerte CHAR(20)”))
# Aufruf der oben gebildeten Funktion. Die Rückgabe wird in ein Objekt gespeichert.
exa_pred <– exa_predict_rf(“SepalLength”, “SepalWidth”, “PetalLength”, “PetalWidth”, “Species”, table = “my_schema.irisdb”, where = “groups = ‘Valid'”) # Mit Hilfe der Table Funktion kann geprüft werden, wie gut die Prognose performt. table(exa_pred$PROGNOSE, exa_pred$REALWERTE, dnn = c(“Prognose”, “Realwerte”)) %>%
addmargins()
|
In the following you will find a simple data mining demo that shows modelling and forecasting in the connection of R and Exasol. If you have not performed the installation and configuration of the necessary components yet, you can find an instruction here.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
# Load the required packages
library(RODBC)
library(exasol)
library(rredis)
library(magrittr)
library(stringi)
library(rpart)
library(partykit)
# Connection to Exasol and Redis
con <– odbcConnect(“exasol_vm”)
redisConnect(“172.20.248.13”)
# Random numbers vector for the sampling
rnd <– rnorm(nrow(iris))
# Add the group variable (Training/Validierung)
iris$groups <– factor(NA, levels = c(“Train”, “Valid”))
# Random draw: 70% Training, 30% Validation
for(i in unique(iris$Species)) {
logVec <– iris$Species == i
iris$groups[logVec] <– ifelse(test = rnd[logVec] > quantile(rnd[logVec],
probs = 0.3),
yes = “Train”,
no = “Valid”)
}
# Review of the sampling
table(iris$groups, iris$Species)
# Clean up the Workspace
rm(rnd, logVec, i)
|
The iris data are now transferred to the Exasol. At first use, a database schema and an empty table are created. Afterwards, the iris data are written into the table. Once the data are uploaded they can be used time and again.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# Create a Database scheme with the name <em>my_schema</em>
odbcQuery(con, “create schema my_schema”)
# Create an empty table with the name <em>irisdb</em>
odbcQuery(con, “create or replace table my_schema.irisdb(
SepalLength DOUBLE,
SepalWidth DOUBLE,
PetalLength DOUBLE,
PetalWidth DOUBLE,
Species CHAR(20),
Groups CHAR(20));”)
# Upload of the <em>iris</em> Data in the Exasol
exa.writeData(con, data = iris, tableName = “my_schema.irisdb”)
|
In the first step of the analysis a decision tree is created locally. Based on this tree a forecast is done.
1
2
3
4
5
6
7
8
9
10
11
12
13
|
# Creating two data sets
train <– subset(iris, subset = groups == “Train”, select = –groups)
valid <– subset(iris, subset = groups == “Valid”, select = –groups)
# Decission tree
localTree <– rpart(Species ~ ., data = train)
# Visualisation of the tree
plot(as.party(localTree))
# Forecast
pred <– predict(localTree, type = “class”, newdata = valid) # Check forecast table(pred, valid$Species, dnn = c(“Vorhersage”, “Tatsächlich”)) %>%
addmargins()
|
The same procedure as above now in the Exasol. The exa.script function creates an R script on the Exasol server. The function call runs the script on the Exasol clusters. This means that the analysis no longer takes place on the local R but on the R instances in the Exasol cluster. Packages which are used on the R instances on the Exasol need to be installed there. Have a look at this blog entry for more information.
The model built in the Exasol cluster will be stored in Redis, a key-value database. With Redis you can distribute models, functions and other R objects in the cluster and load them from the cluster.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
exa_rf <– exa.createScript(
con,
“my_schmea.exa_rf”, # Unter diesem Namen ist das R-Script über SQL verfügbar
function(data) {
# Load the required packages
require(rpart)
require(stringi)
require(rredis)
# Connection with Redis
redisConnect(“172.20.248.13”, port = 6379)
# Loading all data from the Exasol table
data$next_row(NA)
# Transform the <em>data</em> Object into <em>data.frame</em>
df <– data.frame(v1 = data$SepalLength,
v2 = data$SepalWidth,
v3 = data$PetalLength,
v4 = data$PetalWidth,
species = data$Species)
# Prepare the data.frames
df$species <– stri_replace_all_fixed(df$species, ” “, “”)
df$species <– as.factor(df$species)
# Generate a tree
rf <– rpart(species ~ ., data = df)
# Save the tree in Redis
redisSet(“exa_rf”, rf)
# Return the line number
data$emit(nrow(df))
},
inArgs = c(“SepalLength DOUBLE”,
“SepalWidth DOUBLE”,
“PetalLength DOUBLE”,
“PetalWidth DOUBLE”,
“Species CHAR(20)”),
outArgs = “Feedback INT”)
# Call of the function.
exa_rf(“SepalLength”, “SepalWidth”, “PetalLength”, “PetalWidth”, “Species”,
table = “my_schema.irisdb”,
where = “groups = ‘Train'”)
|
The tree model can be used for forecasting in a separate step on the Exasol. The first part is mostly identical with the above function.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
exa_predict_rf <– exa.createScript(
con,
“my_schema.exa_pred”,
function(data) {
require(rpart)
require(rredis)
redisConnect(“172.20.248.13”, port = 6379)
data$next_row(NA)
df <– data.frame(v1 = data$SepalLength,
v2 = data$SepalWidth,
v3 = data$PetalLength,
v4 = data$PetalWidth,
species = data$Species)
# Loading the tree model out of Redis
rf <– redisGet(“exa_rf”)
# Creating a Forecast
pred <– predict(rf, newdata = df, type = “class”)
# Return of the forecast
data$emit(pred, df$species)
},
inArgs = c( “SepalLength DOUBLE”,
“SepalWidth DOUBLE”,
“PetalLength DOUBLE”,
“PetalWidth DOUBLE”,
“Species CHAR(20)”),
outArgs = c(“Prognose CHAR(20)”,
“Realwerte CHAR(20)”))
# Call of the function. The return is stored in an object.
exa_pred <– exa_predict_rf(“SepalLength”, “SepalWidth”, “PetalLength”, “PetalWidth”, “Species”, table = “my_schema.irisdb”, where = “groups = ‘Valid'”) # Performance testing table(exa_pred$PROGNOSE, exa_pred$REALWERTE, dnn = c(“Prognose”, “Realwerte”)) %>%
addmargins()
|