How can we make our daily work more efficient? Is there any straight forward answer? For me, the answer is only one word, experience.
|
Let's assume, that our dataset is couple of millions of rows and couple of hundred of columns, for example a typical production dataset for invoice records or for IOT data. In order to conclude on which is the most accurate design of the model, we will probably need to add many derived columns ( feature engineering process). According to the final accuracy, this should be repeated several times in order to have a first version for our training dataset. This is highly repeatable time-consuming process which can last for days. |
CREATE COLUMN TABLE "CHECK_tbl" (
"RN" BIGINT,
"RN1" BIGINT,
"RN2" BIGINT,
"RN3" BIGINT);
DDL of table "CHECK_tbl"
--CASE 1, USING RAND() ON ORDER BY
SELECT *
FROM "CHECK_tbl"
ORDER BY rand()
LIMIT 1000000
WITH HINT (IGNORE_PLAN_CACHE);
--CASE 2, USING RAND() ON WHERE CLAUSE
SELECT *
FROM "CHECK_tbl"
WHERE rand() < 0.9
LIMIT 1000000
WITH HINT (IGNORE_PLAN_CACHE);
--CASE 3, USING TABLESAMPLE SYNTAX
SELECT *
FROM "CHECK_tbl"
TABLESAMPLE SYSTEM (90)
LIMIT 1000000
WITH HINT (IGNORE_PLAN_CACHE);
--CASE 4, USING FROM PAl,the Preprocessing Algorithms (Sampling method)
DROP TABLE #PAL_PARAMETER_TBL;
CREATE LOCAL TEMPORARY COLUMN TABLE #PAL_PARAMETER_TBL (
"PARAM_NAME" VARCHAR (256),
"INT_VALUE" INTEGER,
"DOUBLE_VALUE" DOUBLE,
"STRING_VALUE" VARCHAR (1000)
);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('SAMPLING_METHOD', 5, NULL, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('SAMPLING_SIZE', 1000000, NULL, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('INTERVAL', 5, NULL, NULL);
INSERT INTO #PAL_PARAMETER_TBL VALUES ('COLUMN_CHOOSE', 2, NULL, NULL);
CALL "_SYS_AFL"."PAL_SAMPLING"("CHECK_tbl", #PAL_PARAMETER_TBL, "OUT_");
4 sampling cases
Metrics of executions
CREATE COLUMN TABLE "DEMO_USER"."DEMO_AGG" (
"ID" INTEGER NOT NULL ,
"Measure_1" DOUBLE NOT NULL ,
"Measure_2" DOUBLE NOT NULL ,
"Measure_3" DOUBLE NOT NULL ,
"Measure_4" DOUBLE NOT NULL ,
"Measure_5" DOUBLE NOT NULL ,
"Measure_6" DOUBLE NOT NULL ,
"Measure_7" DOUBLE NOT NULL ,
"Measure_8" DOUBLE NOT NULL ,
"Measure_9" DOUBLE NOT NULL ,
"Measure_10" DOUBLE NOT NULL )
SAP HANA table creation
.....
# The HANA ML library
import hana_ml.dataframe as dataframe
conn = dataframe.ConnectionContext(server_no_port, port, uname, passw)
dset = conn.sql('''SELECT * FROM DEMO_USER.DEMO_AGG''')
Connection to SAP HANA and define our Hana ML Data frame
Our Hana ML Data frame
########################################################
## ADD DERIVED AGGREGATED COLUMN FROM ALL DATA SET##
#############################################################
corr_cols = list(map(lambda y: y[0], filter(lambda x: (x[1]=='DOUBLE' or x[1]=='DECIMAL'), dset.dtypes())))
corr_cols
#define the group by clm
groupby_col = 'ID'
#define the aggr functions
agg = [ 'count','avg', 'stddev', 'min', 'max', 'median']
agg_sql = []
for operation in agg:
agg_string = map(lambda x: tuple((operation, x, operation+'_'+x)), corr_cols)
agg_sql.extend(agg_string)
dset1 = dset.agg(agg_sql, group_by=groupby_col)
#Run Statement
#dset1.collect()
#Get the sql generated
dset1.select_statement
SAP HANA ML code
- aggregate_operator is one of [‘max’, ‘min’, ‘count’, ‘avg’]
- expression is a str that is a column or column expression name that is the name of this aggregate in the project list.
'SELECT "ID",
count("Measure_1") AS "count_Measure_1",
count("Measure_2") AS "count_Measure_2",
count("Measure_3") AS "count_Measure_3",
count("Measure_4") AS "count_Measure_4",
count("Measure_5") AS "count_Measure_5",
count("Measure_6") AS "count_Measure_6",
count("Measure_7") AS "count_Measure_7",
count("Measure_8") AS "count_Measure_8",
count("Measure_9") AS "count_Measure_9",
count("Measure_10") AS "count_Measure_10",
avg("Measure_1") AS "avg_Measure_1",
avg("Measure_2") AS "avg_Measure_2",
avg("Measure_3") AS "avg_Measure_3",
avg("Measure_4") AS "avg_Measure_4",
avg("Measure_5") AS "avg_Measure_5",
avg("Measure_6") AS "avg_Measure_6",
avg("Measure_7") AS "avg_Measure_7",
avg("Measure_8") AS "avg_Measure_8",
avg("Measure_9") AS "avg_Measure_9",
avg("Measure_10") AS "avg_Measure_10",
stddev("Measure_1") AS "stddev_Measure_1",
stddev("Measure_2") AS "stddev_Measure_2",
stddev("Measure_3") AS "stddev_Measure_3",
stddev("Measure_4") AS "stddev_Measure_4",
stddev("Measure_5") AS "stddev_Measure_5",
stddev("Measure_6") AS "stddev_Measure_6",
stddev("Measure_7") AS "stddev_Measure_7",
stddev("Measure_8") AS "stddev_Measure_8",
stddev("Measure_9") AS "stddev_Measure_9",
stddev("Measure_10") AS "stddev_Measure_10",
min("Measure_1") AS "min_Measure_1",
min("Measure_2") AS "min_Measure_2",
min("Measure_3") AS "min_Measure_3",
min("Measure_4") AS "min_Measure_4",
min("Measure_5") AS "min_Measure_5",
min("Measure_6") AS "min_Measure_6",
min("Measure_7") AS "min_Measure_7",
min("Measure_8") AS "min_Measure_8",
min("Measure_9") AS "min_Measure_9",
min("Measure_10") AS "min_Measure_10",
max("Measure_1") AS "max_Measure_1",
max("Measure_2") AS "max_Measure_2",
max("Measure_3") AS "max_Measure_3",
max("Measure_4") AS "max_Measure_4",
max("Measure_5") AS "max_Measure_5",
max("Measure_6") AS "max_Measure_6",
max("Measure_7") AS "max_Measure_7",
max("Measure_8") AS "max_Measure_8",
max("Measure_9") AS "max_Measure_9",
max("Measure_10") AS "max_Measure_10",
median("Measure_1") AS "median_Measure_1",
median("Measure_2") AS "median_Measure_2",
median("Measure_3") AS "median_Measure_3",
median("Measure_4") AS "median_Measure_4",
median("Measure_5") AS "median_Measure_5",
median("Measure_6") AS "median_Measure_6",
median("Measure_7") AS "median_Measure_7",
median("Measure_8") AS "median_Measure_8",
median("Measure_9") AS "median_Measure_9",
median("Measure_10") AS "median_Measure_10"
FROM (SELECT * FROM DEMO_USER.DEMO_AGG ) AS "DT_0" GROUP BY "ID"'
#############################################################
## ADD DERIVED AGGREGATED COLUMN FROM ALL DATA SET##
#############################################################
corr_cols = list(map(lambda y: y[0], filter(lambda x: (x[1]=='DOUBLE' or x[1]=='DECIMAL'), dset.dtypes())))
corr_cols
#define the group by clm
groupby_col = 'ID'
#define the aggr functions
agg = [ 'count','avg', 'stddev', 'min', 'max', 'median']
agg_sql = []
for operation in agg:
agg_string = map(lambda x: tuple((operation, x, operation+'_'+x)), corr_cols)
agg_sql.extend(agg_string)
dset1 = dset.agg(agg_sql, group_by=groupby_col)
#Run Statement
#dset1.collect()
#Get the sql generated
#dset1.select_statement
#############################################################
#############################################################
## ADD DERIVED AGGREGATED COLUMN PARTITION BY THE ID COLUMN##
#############################################################
agg = [ 'count','avg', 'stddev', 'min', 'max', 'median']
agg_sql = []
for operation in agg:
agg_string = map(lambda x :'%s("%s") over (partition by ID) AS "pct_%s_%s"'%(operation,x,operation,x),corr_cols)
agg_sql.extend(agg_string)
agg_string =", ".join(agg_sql)
df_window_fct_ren = conn.sql('select distinct * from (select "ID", ' + agg_string + ' from DEMO_USER.DEMO_AGG)')
result = dset1.set_index('ID').join(df_window_fct_ren.set_index('ID'))
#Run Statement
#result.collect()
#Get the sql generated - FINAL#
result.select_statement
#############################################################
SELECT T0."ID",
T0."count_Measure_1",
T0."count_Measure_2",
T0."count_Measure_3",
T0."count_Measure_4",
T0."count_Measure_5",
T0."count_Measure_6",
T0."count_Measure_7",
T0."count_Measure_8",
T0."count_Measure_9",
T0."count_Measure_10",
T0."avg_Measure_1",
T0."avg_Measure_2",
T0."avg_Measure_3",
T0."avg_Measure_4",
T0."avg_Measure_5",
T0."avg_Measure_6",
T0."avg_Measure_7",
T0."avg_Measure_8",
T0."avg_Measure_9",
T0."avg_Measure_10",
T0."stddev_Measure_1",
T0."stddev_Measure_2",
T0."stddev_Measure_3",
T0."stddev_Measure_4",
T0."stddev_Measure_5",
T0."stddev_Measure_6",
T0."stddev_Measure_7",
T0."stddev_Measure_8",
T0."stddev_Measure_9",
T0."stddev_Measure_10",
T0."min_Measure_1",
T0."min_Measure_2",
T0."min_Measure_3",
T0."min_Measure_4",
T0."min_Measure_5",
T0."min_Measure_6",
T0."min_Measure_7",
T0."min_Measure_8",
T0."min_Measure_9",
T0."min_Measure_10",
T0."max_Measure_1",
T0."max_Measure_2",
T0."max_Measure_3",
T0."max_Measure_4",
T0."max_Measure_5",
T0."max_Measure_6",
T0."max_Measure_7",
T0."max_Measure_8",
T0."max_Measure_9",
T0."max_Measure_10",
T0."median_Measure_1",
T0."median_Measure_2",
T0."median_Measure_3",
T0."median_Measure_4",
T0."median_Measure_5",
T0."median_Measure_6",
T0."median_Measure_7",
T0."median_Measure_8",
T0."median_Measure_9",
T0."median_Measure_10",
T1."pct_count_Measure_1",
T1."pct_count_Measure_2",
T1."pct_count_Measure_3",
T1."pct_count_Measure_4",
T1."pct_count_Measure_5",
T1."pct_count_Measure_6",
T1."pct_count_Measure_7",
T1."pct_count_Measure_8",
T1."pct_count_Measure_9",
T1."pct_count_Measure_10",
T1."pct_avg_Measure_1",
T1."pct_avg_Measure_2",
T1."pct_avg_Measure_3",
T1."pct_avg_Measure_4",
T1."pct_avg_Measure_5",
T1."pct_avg_Measure_6",
T1."pct_avg_Measure_7",
T1."pct_avg_Measure_8",
T1."pct_avg_Measure_9",
T1."pct_avg_Measure_10",
T1."pct_stddev_Measure_1",
T1."pct_stddev_Measure_2",
T1."pct_stddev_Measure_3",
T1."pct_stddev_Measure_4",
T1."pct_stddev_Measure_5",
T1."pct_stddev_Measure_6",
T1."pct_stddev_Measure_7",
T1."pct_stddev_Measure_8",
T1."pct_stddev_Measure_9",
T1."pct_stddev_Measure_10",
T1."pct_min_Measure_1",
T1."pct_min_Measure_2",
T1."pct_min_Measure_3",
T1."pct_min_Measure_4",
T1."pct_min_Measure_5",
T1."pct_min_Measure_6",
T1."pct_min_Measure_7",
T1."pct_min_Measure_8",
T1."pct_min_Measure_9",
T1."pct_min_Measure_10",
T1."pct_max_Measure_1",
T1."pct_max_Measure_2",
T1."pct_max_Measure_3",
T1."pct_max_Measure_4",
T1."pct_max_Measure_5",
T1."pct_max_Measure_6",
T1."pct_max_Measure_7",
T1."pct_max_Measure_8",
T1."pct_max_Measure_9",
T1."pct_max_Measure_10",
T1."pct_median_Measure_1",
T1."pct_median_Measure_2",
T1."pct_median_Measure_3",
T1."pct_median_Measure_4",
T1."pct_median_Measure_5",
T1."pct_median_Measure_6",
T1."pct_median_Measure_7",
T1."pct_median_Measure_8",
T1."pct_median_Measure_9",
T1."pct_median_Measure_10"\nFROM (SELECT "ID",
count("Measure_1") AS "count_Measure_1",
count("Measure_2") AS "count_Measure_2",
count("Measure_3") AS "count_Measure_3",
count("Measure_4") AS "count_Measure_4",
count("Measure_5") AS "count_Measure_5",
count("Measure_6") AS "count_Measure_6",
count("Measure_7") AS "count_Measure_7",
count("Measure_8") AS "count_Measure_8",
count("Measure_9") AS "count_Measure_9",
count("Measure_10") AS "count_Measure_10",
avg("Measure_1") AS "avg_Measure_1",
avg("Measure_2") AS "avg_Measure_2",
avg("Measure_3") AS "avg_Measure_3",
avg("Measure_4") AS "avg_Measure_4",
avg("Measure_5") AS "avg_Measure_5",
avg("Measure_6") AS "avg_Measure_6",
avg("Measure_7") AS "avg_Measure_7",
avg("Measure_8") AS "avg_Measure_8",
avg("Measure_9") AS "avg_Measure_9",
avg("Measure_10") AS "avg_Measure_10",
stddev("Measure_1") AS "stddev_Measure_1",
stddev("Measure_2") AS "stddev_Measure_2",
stddev("Measure_3") AS "stddev_Measure_3",
stddev("Measure_4") AS "stddev_Measure_4",
stddev("Measure_5") AS "stddev_Measure_5",
stddev("Measure_6") AS "stddev_Measure_6",
stddev("Measure_7") AS "stddev_Measure_7",
stddev("Measure_8") AS "stddev_Measure_8",
stddev("Measure_9") AS "stddev_Measure_9",
stddev("Measure_10") AS "stddev_Measure_10",
min("Measure_1") AS "min_Measure_1",
min("Measure_2") AS "min_Measure_2",
min("Measure_3") AS "min_Measure_3",
min("Measure_4") AS "min_Measure_4",
min("Measure_5") AS "min_Measure_5",
min("Measure_6") AS "min_Measure_6",
min("Measure_7") AS "min_Measure_7",
min("Measure_8") AS "min_Measure_8",
min("Measure_9") AS "min_Measure_9",
min("Measure_10") AS "min_Measure_10",
max("Measure_1") AS "max_Measure_1",
max("Measure_2") AS "max_Measure_2",
max("Measure_3") AS "max_Measure_3",
max("Measure_4") AS "max_Measure_4",
max("Measure_5") AS "max_Measure_5",
max("Measure_6") AS "max_Measure_6",
max("Measure_7") AS "max_Measure_7",
max("Measure_8") AS "max_Measure_8",
max("Measure_9") AS "max_Measure_9",
max("Measure_10") AS "max_Measure_10",
median("Measure_1") AS "median_Measure_1",
median("Measure_2") AS "median_Measure_2",
median("Measure_3") AS "median_Measure_3",
median("Measure_4") AS "median_Measure_4",
median("Measure_5") AS "median_Measure_5",
median("Measure_6") AS "median_Measure_6",
median("Measure_7") AS "median_Measure_7",
median("Measure_8") AS "median_Measure_8",
median("Measure_9") AS "median_Measure_9",
median("Measure_10") AS "median_Measure_10"
FROM (SELECT * FROM DEMO_USER.DEMO_AGG\n ) AS "DT_0" GROUP BY "ID") T0
INNER JOIN (select distinct * from (select "ID",
count("Measure_1") over (partition by ID) AS "pct_count_Measure_1",
count("Measure_2") over (partition by ID) AS "pct_count_Measure_2",
count("Measure_3") over (partition by ID) AS "pct_count_Measure_3",
count("Measure_4") over (partition by ID) AS "pct_count_Measure_4",
count("Measure_5") over (partition by ID) AS "pct_count_Measure_5",
count("Measure_6") over (partition by ID) AS "pct_count_Measure_6",
count("Measure_7") over (partition by ID) AS "pct_count_Measure_7",
count("Measure_8") over (partition by ID) AS "pct_count_Measure_8",
count("Measure_9") over (partition by ID) AS "pct_count_Measure_9",
count("Measure_10") over (partition by ID) AS "pct_count_Measure_10",
avg("Measure_1") over (partition by ID) AS "pct_avg_Measure_1",
avg("Measure_2") over (partition by ID) AS "pct_avg_Measure_2",
avg("Measure_3") over (partition by ID) AS "pct_avg_Measure_3",
avg("Measure_4") over (partition by ID) AS "pct_avg_Measure_4",
avg("Measure_5") over (partition by ID) AS "pct_avg_Measure_5",
avg("Measure_6") over (partition by ID) AS "pct_avg_Measure_6",
avg("Measure_7") over (partition by ID) AS "pct_avg_Measure_7",
avg("Measure_8") over (partition by ID) AS "pct_avg_Measure_8",
avg("Measure_9") over (partition by ID) AS "pct_avg_Measure_9",
avg("Measure_10") over (partition by ID) AS "pct_avg_Measure_10",
stddev("Measure_1") over (partition by ID) AS "pct_stddev_Measure_1",
stddev("Measure_2") over (partition by ID) AS "pct_stddev_Measure_2",
stddev("Measure_3") over (partition by ID) AS "pct_stddev_Measure_3",
stddev("Measure_4") over (partition by ID) AS "pct_stddev_Measure_4",
stddev("Measure_5") over (partition by ID) AS "pct_stddev_Measure_5",
stddev("Measure_6") over (partition by ID) AS "pct_stddev_Measure_6",
stddev("Measure_7") over (partition by ID) AS "pct_stddev_Measure_7",
stddev("Measure_8") over (partition by ID) AS "pct_stddev_Measure_8",
stddev("Measure_9") over (partition by ID) AS "pct_stddev_Measure_9",
stddev("Measure_10") over (partition by ID) AS "pct_stddev_Measure_10",
min("Measure_1") over (partition by ID) AS "pct_min_Measure_1",
min("Measure_2") over (partition by ID) AS "pct_min_Measure_2",
min("Measure_3") over (partition by ID) AS "pct_min_Measure_3",
min("Measure_4") over (partition by ID) AS "pct_min_Measure_4",
min("Measure_5") over (partition by ID) AS "pct_min_Measure_5",
min("Measure_6") over (partition by ID) AS "pct_min_Measure_6",
min("Measure_7") over (partition by ID) AS "pct_min_Measure_7",
min("Measure_8") over (partition by ID) AS "pct_min_Measure_8",
min("Measure_9") over (partition by ID) AS "pct_min_Measure_9",
min("Measure_10") over (partition by ID) AS "pct_min_Measure_10",
max("Measure_1") over (partition by ID) AS "pct_max_Measure_1",
max("Measure_2") over (partition by ID) AS "pct_max_Measure_2",
max("Measure_3") over (partition by ID) AS "pct_max_Measure_3",
max("Measure_4") over (partition by ID) AS "pct_max_Measure_4",
max("Measure_5") over (partition by ID) AS "pct_max_Measure_5",
max("Measure_6") over (partition by ID) AS "pct_max_Measure_6",
max("Measure_7") over (partition by ID) AS "pct_max_Measure_7",
max("Measure_8") over (partition by ID) AS "pct_max_Measure_8",
max("Measure_9") over (partition by ID) AS "pct_max_Measure_9",
max("Measure_10") over (partition by ID) AS "pct_max_Measure_10",
median("Measure_1") over (partition by ID) AS "pct_median_Measure_1",
median("Measure_2") over (partition by ID) AS "pct_median_Measure_2",
median("Measure_3") over (partition by ID) AS "pct_median_Measure_3",
median("Measure_4") over (partition by ID) AS "pct_median_Measure_4",
median("Measure_5") over (partition by ID) AS "pct_median_Measure_5",
median("Measure_6") over (partition by ID) AS "pct_median_Measure_6",
median("Measure_7") over (partition by ID) AS "pct_median_Measure_7",
median("Measure_8") over (partition by ID) AS "pct_median_Measure_8",
median("Measure_9") over (partition by ID) AS "pct_median_Measure_9",
median("Measure_10") over (partition by ID) AS "pct_median_Measure_10" from DEMO_USER.DEMO_AGG)) T1
ON T0."ID" = T1."ID"
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
37 | |
25 | |
17 | |
13 | |
7 | |
7 | |
7 | |
6 | |
6 | |
6 |