>>> from math import pi
>>> π = pi
>>> area = π * r**2
>>> résumé = "knows Python"
>>> "Python" in résumé
True
What could go wrong? You have the skills, you have the data... but all too often Data Science is seen as more of an analysis task than a process.
"So how can we avoid this seemingly unavoidable issue? My Data Scientists don't want to use proprietary tools, they like Python!"
# First instantiate the Connection Object (conn)
conn = dataframe.ConnectionContext("IP ADDRESS", "PORT", "USER", "PASSWORD")
# Then create the HANA dataframe (df) and point to the table.
df = conn.table("TABLE", schema="SCHEMA")
df.head(5).collect()
ITEM_IDENTIFIER | ITEM_WEIGHT | ITEM_FAT_CONTENT | ITEM_VISIBILITY | ITEM_TYPE | ITEM_MRP | OUTLET_IDENTIFIER | OUTLET_ESTABLISHMENT_YEAR | OUTLET_SIZE | OUTLET_LOCATION_TYPE | OUTLET_TYPE | ITEM_OUTLET_SALES |
FDA15 | 9.3 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.138 |
DRC01 | 5.92 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
FDN15 | 17.5 | Low Fat | 0.01676 | Meat | 141.618 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.27 |
FDX07 | 19.2 | Regular | 0 | Fruits and Vegetables | 182.095 | OUT010 | 1998 | Tier 3 | Grocery Store | 732.38 | |
NCD19 | 8.93 | Low Fat | 0 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
-- The first method, head() generates this SQL
SELECT TOP 5 * FROM (SELECT * FROM "SCHEMA"."TABLE") dt
# The second method collect() returns the results in a Pandas DataFrame
return pd.DataFrame(results, columns=self.columns)
# Describe all columns in a DataFrame
df.describe().collect()
column | count | unique | nulls | mean | std | min | max | median | 25_percent_cont | 25_percent_disc | 50_percent_cont | 50_percent_disc | 75_percent_cont | 75_percent_disc |
ITEM_WEIGHT | 8523 | 416 | 0 | 10.65059 | 6.431899 | 0 | 21.35 | 11 | 6.65 | 6.65 | 11 | 11 | 16 | 16 |
ITEM_VISIBILITY | 8523 | 7880 | 0 | 0.066132 | 0.051598 | 0 | 0.328391 | 0.053931 | 0.026989 | 0.026986 | 0.053931 | 0.053931 | 0.094585 | 0.094603 |
ITEM_MRP | 8523 | 5938 | 0 | 140.9928 | 62.27507 | 31.29 | 266.8884 | 143.0128 | 93.8265 | 93.8094 | 143.0128 | 143.0128 | 185.6437 | 185.6608 |
OUTLET_ESTABLISHMENT_YEAR | 8523 | 9 | 0 | 1997.832 | 8.37176 | 1985 | 2009 | 1999 | 1987 | 1987 | 1999 | 1999 | 2004 | 2004 |
ITEM_OUTLET_SALES | 8523 | 3493 | 0 | 2181.289 | 1706.5 | 33.29 | 13086.96 | 1794.331 | 834.2474 | 833.5816 | 1794.331 | 1794.331 | 3101.296 | 3101.296 |
-- First some simple statistics are calculated
SELECT 'ITEM_WEIGHT' AS "column",
COUNT("ITEM_WEIGHT") AS "count",
COUNT(DISTINCT "ITEM_WEIGHT") AS "unique",
SUM(CASE WHEN "ITEM_WEIGHT" IS NULL THEN 1 ELSE 0 END) AS "nulls",
AVG("ITEM_WEIGHT") AS "mean",
STDDEV("ITEM_WEIGHT") AS "std",
MIN("ITEM_WEIGHT") AS "min",
MAX("ITEM_WEIGHT") AS "max",
MEDIAN("ITEM_WEIGHT") AS "median" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0"
UNION ALL SELECT 'ITEM_VISIBILITY' AS "column",
COUNT("ITEM_VISIBILITY") AS "count",
COUNT(DISTINCT "ITEM_VISIBILITY") AS "unique",
SUM(CASE WHEN "ITEM_VISIBILITY" IS NULL THEN 1 ELSE 0 END) AS "nulls",
AVG("ITEM_VISIBILITY") AS "mean",
STDDEV("ITEM_VISIBILITY") AS "std",
MIN("ITEM_VISIBILITY") AS "min",
MAX("ITEM_VISIBILITY") AS "max",
MEDIAN("ITEM_VISIBILITY") AS "median" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0"
UNION ALL SELECT 'ITEM_MRP' AS "column",
COUNT("ITEM_MRP") AS "count",
COUNT(DISTINCT "ITEM_MRP") AS "unique",
SUM(CASE WHEN "ITEM_MRP" IS NULL THEN 1 ELSE 0 END) AS "nulls",
AVG("ITEM_MRP") AS "mean",
STDDEV("ITEM_MRP") AS "std",
MIN("ITEM_MRP") AS "min",
MAX("ITEM_MRP") AS "max",
MEDIAN("ITEM_MRP") AS "median" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0"
UNION ALL SELECT 'OUTLET_ESTABLISHMENT_YEAR' AS "column",
COUNT("OUTLET_ESTABLISHMENT_YEAR") AS "count",
COUNT(DISTINCT "OUTLET_ESTABLISHMENT_YEAR") AS "unique",
SUM(CASE WHEN "OUTLET_ESTABLISHMENT_YEAR" IS NULL THEN 1 ELSE 0 END) AS "nulls",
AVG("OUTLET_ESTABLISHMENT_YEAR") AS "mean",
STDDEV("OUTLET_ESTABLISHMENT_YEAR") AS "std",
MIN("OUTLET_ESTABLISHMENT_YEAR") AS "min",
MAX("OUTLET_ESTABLISHMENT_YEAR") AS "max",
MEDIAN("OUTLET_ESTABLISHMENT_YEAR") AS "median" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0"
UNION ALL SELECT 'ITEM_OUTLET_SALES' AS "column",
COUNT("ITEM_OUTLET_SALES") AS "count",
COUNT(DISTINCT "ITEM_OUTLET_SALES") AS "unique",
SUM(CASE WHEN "ITEM_OUTLET_SALES" IS NULL THEN 1 ELSE 0 END) AS "nulls",
AVG("ITEM_OUTLET_SALES") AS "mean",
STDDEV("ITEM_OUTLET_SALES") AS "std",
MIN("ITEM_OUTLET_SALES") AS "min",
MAX("ITEM_OUTLET_SALES") AS "max",
MEDIAN("ITEM_OUTLET_SALES") AS "median" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0";
-- Second, percentiles are calculated
SELECT 'ITEM_WEIGHT' as "column",
* FROM (SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY "ITEM_WEIGHT") AS "25_percent_cont",
percentile_disc(0.25) WITHIN GROUP (ORDER BY "ITEM_WEIGHT") AS "25_percent_disc",
percentile_cont(0.50) WITHIN GROUP (ORDER BY "ITEM_WEIGHT") AS "50_percent_cont",
percentile_disc(0.50) WITHIN GROUP (ORDER BY "ITEM_WEIGHT") AS "50_percent_disc",
percentile_cont(0.75) WITHIN GROUP (ORDER BY "ITEM_WEIGHT") AS "75_percent_cont",
percentile_disc(0.75) WITHIN GROUP (ORDER BY "ITEM_WEIGHT") AS "75_percent_disc" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0")
UNION ALL SELECT 'ITEM_VISIBILITY' as "column",
* FROM (SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY "ITEM_VISIBILITY") AS "25_percent_cont",
percentile_disc(0.25) WITHIN GROUP (ORDER BY "ITEM_VISIBILITY") AS "25_percent_disc",
percentile_cont(0.50) WITHIN GROUP (ORDER BY "ITEM_VISIBILITY") AS "50_percent_cont",
percentile_disc(0.50) WITHIN GROUP (ORDER BY "ITEM_VISIBILITY") AS "50_percent_disc",
percentile_cont(0.75) WITHIN GROUP (ORDER BY "ITEM_VISIBILITY") AS "75_percent_cont",
percentile_disc(0.75) WITHIN GROUP (ORDER BY "ITEM_VISIBILITY") AS "75_percent_disc" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0")
UNION ALL SELECT 'ITEM_MRP' as "column",
* FROM (SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY "ITEM_MRP") AS "25_percent_cont",
percentile_disc(0.25) WITHIN GROUP (ORDER BY "ITEM_MRP") AS "25_percent_disc",
percentile_cont(0.50) WITHIN GROUP (ORDER BY "ITEM_MRP") AS "50_percent_cont",
percentile_disc(0.50) WITHIN GROUP (ORDER BY "ITEM_MRP") AS "50_percent_disc",
percentile_cont(0.75) WITHIN GROUP (ORDER BY "ITEM_MRP") AS "75_percent_cont",
percentile_disc(0.75) WITHIN GROUP (ORDER BY "ITEM_MRP") AS "75_percent_disc" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0")
UNION ALL SELECT 'OUTLET_ESTABLISHMENT_YEAR' as "column",
* FROM (SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY "OUTLET_ESTABLISHMENT_YEAR") AS "25_percent_cont",
percentile_disc(0.25) WITHIN GROUP (ORDER BY "OUTLET_ESTABLISHMENT_YEAR") AS "25_percent_disc",
percentile_cont(0.50) WITHIN GROUP (ORDER BY "OUTLET_ESTABLISHMENT_YEAR") AS "50_percent_cont",
percentile_disc(0.50) WITHIN GROUP (ORDER BY "OUTLET_ESTABLISHMENT_YEAR") AS "50_percent_disc",
percentile_cont(0.75) WITHIN GROUP (ORDER BY "OUTLET_ESTABLISHMENT_YEAR") AS "75_percent_cont",
percentile_disc(0.75) WITHIN GROUP (ORDER BY "OUTLET_ESTABLISHMENT_YEAR") AS "75_percent_disc" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0")
UNION ALL SELECT 'ITEM_OUTLET_SALES' as "column",
* FROM (SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY "ITEM_OUTLET_SALES") AS "25_percent_cont",
percentile_disc(0.25) WITHIN GROUP (ORDER BY "ITEM_OUTLET_SALES") AS "25_percent_disc",
percentile_cont(0.50) WITHIN GROUP (ORDER BY "ITEM_OUTLET_SALES") AS "50_percent_cont",
percentile_disc(0.50) WITHIN GROUP (ORDER BY "ITEM_OUTLET_SALES") AS "50_percent_disc",
percentile_cont(0.75) WITHIN GROUP (ORDER BY "ITEM_OUTLET_SALES") AS "75_percent_cont",
percentile_disc(0.75) WITHIN GROUP (ORDER BY "ITEM_OUTLET_SALES") AS "75_percent_disc" FROM (SELECT * FROM "SCHEMA"."TABLE") AS "DT_0");
# This will return the SQL statement for the describe method
df.describe().select_statement
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
21 | |
16 | |
13 | |
12 | |
11 | |
10 | |
9 | |
9 | |
8 | |
7 |