(source: http://gdeltproject.org/)
Raw event files: http://data.gdeltproject.org/events/index.html
File format documentation: http://gdeltproject.org/data.html#documentation
In our scenario, we decided to use SAP HANA to host the "dimension" data and SAP Vora to host the events data.
CREATE SCHEMA DEMO_HANA;
CREATE COLUMN TABLE "DEMO_HANA"."COUNTRY" (
"CODE" NVARCHAR(3) NOT NULL,
"LABEL" NVARCHAR(50),
PRIMARY KEY ("CODE")
) UNLOAD PRIORITY 5 AUTO MERGE ;
CREATE COLUMN TABLE "DEMO_HANA"."EVENTCODE" (
"CAMEOEVENTCODE" NVARCHAR(5) NOT NULL,
"EVENTDESCRIPTION" NVARCHAR(100),
PRIMARY KEY ("CAMEOEVENTCODE")
) UNLOAD PRIORITY 5 AUTO MERGE ;
CREATE COLUMN TABLE "DEMO_HANA"."GOLDSTEINSCALE" (
"CAMEOEVENTCODE" NVARCHAR(5) NOT NULL,
"GOLDSTEINSCALE" DECIMAL(6,1) CS_FIXED,
PRIMARY KEY ("CAMEOEVENTCODE")
) UNLOAD PRIORITY 5 AUTO MERGE ;
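These dimension tables can be populated from the CAMEO lookup files linked from the documentation page above (e.g. CAMEO.country.txt and CAMEO.eventcodes.txt, which are tab-delimited). As a minimal sketch, assuming you copied a lookup file to a server-side path that your HANA instance is allowed to read (the path below is illustrative), an IMPORT statement per table could look like this:
-- Illustrative: load the CAMEO country lookup file into the COUNTRY table.
-- Assumes the file sits at /usr/sap/data on the HANA server and starts with a header row.
IMPORT FROM CSV FILE '/usr/sap/data/CAMEO.country.txt'
INTO "DEMO_HANA"."COUNTRY"
WITH RECORD DELIMITED BY '\n'
     FIELD DELIMITED BY '\t'
     SKIP FIRST 1 ROW;
The same pattern applies to EVENTCODE and GOLDSTEINSCALE with their respective lookup files.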
Next, download the daily export files (one ZIP per day) from http://data.gdeltproject.org/events/index.html.
Then you can transfer the files to your SAP Vora virtual machine, using either the shared-folder feature of your hypervisor or FTP.
You can, for example, copy the files into a directory like /home/vora/zip.
Note: for those familiar with wget or curl, you can download the files directly to your virtual machine, as shown below (but make sure your proxy is properly configured if you use one).
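For example, a minimal wget sketch for a single day, assuming the GDELT daily naming convention YYYYMMDD.export.CSV.zip (adjust the date to the files you need):
mkdir -p /home/vora/zip
cd /home/vora/zip
wget http://data.gdeltproject.org/events/20170318.export.CSV.zip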
In either case, I recommend transferring the zipped version of the files, as the transfer will be faster; you will then need to unzip them with the following commands:
mkdir /home/vora/csv
unzip "/home/vora/zip/*.zip" -d /home/vora/csv
The uncompressed files are "CSV" files, but they are tab-delimited and do not quote their fields, which complicates the import into the disk engine (the relational engine handles them fine, however).
So, you can run the following command to transform the files into a more standard comma-separated, quoted format:
for i in /home/vora/csv/*.CSV; do sed -e 's/"/\\"/g' -e $'s/\t/","/g' -e 's/^/"/' -e 's/$/"/' "$i" > "${i%.CSV}.csv"; done
The $'...' quoting lets bash expand \t into a literal tab before sed sees it; alternatively, type an actual tab in its place (CTRL+V, then the Tab key).
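As a quick sanity check (with made-up sample values), you can run the same substitution on a single line:
printf 'ID123\t20170318\t201703\n' | sed -e 's/"/\\"/g' -e $'s/\t/","/g' -e 's/^/"/' -e 's/$/"/'
which should print "ID123","20170318","201703".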
Then, we need to import the data into Hadoop HDFS with the following commands (executed as the vora user):
sudo hdfs dfs -mkdir /user/hdfs
sudo hdfs dfs -mkdir /user/hdfs/csv
sudo hdfs dfs -put /home/vora/csv/*.csv /user/hdfs/csv
sudo hdfs dfs -chmod -R 777 /user/hdfs
We used "sudo" here, because Hadoop is currently installed and running as the "root" user.
Now that the files are in HDFS, you can run the following SQL statements from the Vora tools SQL Editor to create the tables:
DROP TABLE IF EXISTS EVENTS_DE;
CREATE TABLE EVENTS_DE (
GLOBALEVENTID BIGINT,
SQLDATE INTEGER,
MONTHYEAR INTEGER,
YYEAR INTEGER,
FRACTIONDATE VARCHAR(20),
ACTOR1CODE VARCHAR(20),
ACTOR1NAME VARCHAR(255),
ACTOR1COUNTRYCODE VARCHAR(3),
ACTOR1KNOWNGROUPCODE VARCHAR(3),
ACTOR1ETHNICCODE VARCHAR(3),
ACTOR1RELIGION1CODE VARCHAR(3),
ACTOR1RELIGION2CODE VARCHAR(3),
ACTOR1TYPE1CODE VARCHAR(3),
ACTOR1TYPE2CODE VARCHAR(3),
ACTOR1TYPE3CODE VARCHAR(3),
ACTOR2CODE VARCHAR(15),
ACTOR2NAME VARCHAR(255),
ACTOR2COUNTRYCODE VARCHAR(3),
ACTOR2KNOWNGROUPCODE VARCHAR(3),
ACTOR2ETHNICCODE VARCHAR(3),
ACTOR2RELIGION1CODE VARCHAR(3),
ACTOR2RELIGION2CODE VARCHAR(3),
ACTOR2TYPE1CODE VARCHAR(3),
ACTOR2TYPE2CODE VARCHAR(3),
ACTOR2TYPE3CODE VARCHAR(3),
ISROOTEVENT INTEGER,
EVENTCODE VARCHAR(4),
EVENTBASECODE VARCHAR(3),
EVENTROOTCODE VARCHAR(2),
QUADCLASS INTEGER,
GOLDSTEINSCALE DECIMAL(3,1),
NUMMENTIONS INTEGER,
NUMSOURCES INTEGER,
NUMARTICLES INTEGER,
AVGTONE DOUBLE,
ACTOR1GEO_TYPE INTEGER,
ACTOR1GEO_FULLNAME VARCHAR(255),
ACTOR1GEO_COUNTRYCODE VARCHAR(2),
ACTOR1GEO_ADM1CODE VARCHAR(5),
ACTOR1GEO_LAT DECIMAL(9,4),
ACTOR1GEO_LONG DECIMAL(9,4),
ACTOR1GEO_FEATUREID VARCHAR(10),
ACTOR2GEO_TYPE INTEGER,
ACTOR2GEO_FULLNAME VARCHAR(255),
ACTOR2GEO_COUNTRYCODE VARCHAR(3),
ACTOR2GEO_ADM1CODE VARCHAR(5),
ACTOR2GEO_LAT DECIMAL(9,4),
ACTOR2GEO_LONG DECIMAL(9,4),
ACTOR2GEO_FEATUREID VARCHAR(10),
ACTIONGEO_TYPE INTEGER,
ACTIONGEO_FULLNAME VARCHAR(255),
ACTIONGEO_COUNTRYCODE VARCHAR(3),
ACTIONGEO_ADM1CODE VARCHAR(5),
ACTIONGEO_LAT DECIMAL(9,4),
ACTIONGEO_LONG DECIMAL(9,4),
ACTIONGEO_FEATUREID VARCHAR(50),
DATEADDED INTEGER,
SOURCEURL VARCHAR(255)
) USING
com.sap.spark.engines.disk
OPTIONS (
storagebackend "hdfs",
files "/user/hdfs/csv/20170318.export.csv",
format "csv",
tableName "EVENTS_DISK",
tableSchema "GLOBALEVENTID BIGINT, SQLDATE INTEGER, MONTHYEAR INTEGER, YYEAR INTEGER, FRACTIONDATE VARCHAR(20), ACTOR1CODE VARCHAR(20), ACTOR1NAME VARCHAR(255), ACTOR1COUNTRYCODE VARCHAR(3), ACTOR1KNOWNGROUPCODE VARCHAR(3), ACTOR1ETHNICCODE VARCHAR(3), ACTOR1RELIGION1CODE VARCHAR(3), ACTOR1RELIGION2CODE VARCHAR(3), ACTOR1TYPE1CODE VARCHAR(3), ACTOR1TYPE2CODE VARCHAR(3), ACTOR1TYPE3CODE VARCHAR(3), ACTOR2CODE VARCHAR(15), ACTOR2NAME VARCHAR(255), ACTOR2COUNTRYCODE VARCHAR(3), ACTOR2KNOWNGROUPCODE VARCHAR(3), ACTOR2ETHNICCODE VARCHAR(3), ACTOR2RELIGION1CODE VARCHAR(3), ACTOR2RELIGION2CODE VARCHAR(3), ACTOR2TYPE1CODE VARCHAR(3), ACTOR2TYPE2CODE VARCHAR(3), ACTOR2TYPE3CODE VARCHAR(3), ISROOTEVENT INTEGER, EVENTCODE VARCHAR(4), EVENTBASECODE VARCHAR(3), EVENTROOTCODE VARCHAR(2), QUADCLASS INTEGER, GOLDSTEINSCALE DECIMAL(1,1), NUMMENTIONS INTEGER, NUMSOURCES INTEGER, NUMARTICLES INTEGER, AVGTONE DOUBLE, ACTOR1GEO_TYPE INTEGER, ACTOR1GEO_FULLNAME VARCHAR(255), ACTOR1GEO_COUNTRYCODE VARCHAR(2), ACTOR1GEO_ADM1CODE VARCHAR(5), ACTOR1GEO_LAT DECIMAL(9,4), ACTOR1GEO_LONG DECIMAL(9,4), ACTOR1GEO_FEATUREID VARCHAR(10), ACTOR2GEO_TYPE INTEGER, ACTOR2GEO_FULLNAME VARCHAR(255), ACTOR2GEO_COUNTRYCODE VARCHAR(3), ACTOR2GEO_ADM1CODE VARCHAR(5), ACTOR2GEO_LAT DECIMAL(9,4), ACTOR2GEO_LONG DECIMAL(9,4), ACTOR2GEO_FEATUREID VARCHAR(10), ACTIONGEO_TYPE INTEGER, ACTIONGEO_FULLNAME VARCHAR(255), ACTIONGEO_COUNTRYCODE VARCHAR(3), ACTIONGEO_ADM1CODE VARCHAR(5), ACTIONGEO_LAT DECIMAL(9,4), ACTIONGEO_LONG DECIMAL(9,4), ACTIONGEO_FEATUREID VARCHAR(50), DATEADDED INTEGER, SOURCEURL VARCHAR(255)"
);
SELECT * FROM EVENTS_DE LIMIT 10;
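If this returns rows, the disk engine table is correctly mapped to the file. As a further illustrative check, you can run a small aggregation, for example counting events per action country:
SELECT ACTIONGEO_COUNTRYCODE, COUNT(*) AS NUM_EVENTS
FROM EVENTS_DE
GROUP BY ACTIONGEO_COUNTRYCODE
ORDER BY NUM_EVENTS DESC
LIMIT 10;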
DROP TABLE IF EXISTS EVENTS_RE;
CREATE TABLE EVENTS_RE(
GLOBALEVENTID BIGINT,
SQLDATE INTEGER,
MONTHYEAR INTEGER,
YYEAR INTEGER,
FRACTIONDATE VARCHAR(20),
ACTOR1CODE VARCHAR(20),
ACTOR1NAME VARCHAR(255),
ACTOR1COUNTRYCODE VARCHAR(3),
ACTOR1KNOWNGROUPCODE VARCHAR(3),
ACTOR1ETHNICCODE VARCHAR(3),
ACTOR1RELIGION1CODE VARCHAR(3),
ACTOR1RELIGION2CODE VARCHAR(3),
ACTOR1TYPE1CODE VARCHAR(3),
ACTOR1TYPE2CODE VARCHAR(3),
ACTOR1TYPE3CODE VARCHAR(3),
ACTOR2CODE VARCHAR(15),
ACTOR2NAME VARCHAR(255),
ACTOR2COUNTRYCODE VARCHAR(3),
ACTOR2KNOWNGROUPCODE VARCHAR(3),
ACTOR2ETHNICCODE VARCHAR(3),
ACTOR2RELIGION1CODE VARCHAR(3),
ACTOR2RELIGION2CODE VARCHAR(3),
ACTOR2TYPE1CODE VARCHAR(3),
ACTOR2TYPE2CODE VARCHAR(3),
ACTOR2TYPE3CODE VARCHAR(3),
ISROOTEVENT INTEGER,
EVENTCODE VARCHAR(4),
EVENTBASECODE VARCHAR(3),
EVENTROOTCODE VARCHAR(2),
QUADCLASS INTEGER,
GOLDSTEINSCALE DECIMAL(3,1),
NUMMENTIONS INTEGER,
NUMSOURCES INTEGER,
NUMARTICLES INTEGER,
AVGTONE DOUBLE,
ACTOR1GEO_TYPE INTEGER,
ACTOR1GEO_FULLNAME VARCHAR(255),
ACTOR1GEO_COUNTRYCODE VARCHAR(2),
ACTOR1GEO_ADM1CODE VARCHAR(5),
ACTOR1GEO_LAT DECIMAL(9,4),
ACTOR1GEO_LONG DECIMAL(9,4),
ACTOR1GEO_FEATUREID VARCHAR(10),
ACTOR2GEO_TYPE INTEGER,
ACTOR2GEO_FULLNAME VARCHAR(255),
ACTOR2GEO_COUNTRYCODE VARCHAR(3),
ACTOR2GEO_ADM1CODE VARCHAR(5),
ACTOR2GEO_LAT DECIMAL(9,4),
ACTOR2GEO_LONG DECIMAL(9,4),
ACTOR2GEO_FEATUREID VARCHAR(10),
ACTIONGEO_TYPE INTEGER,
ACTIONGEO_FULLNAME VARCHAR(255),
ACTIONGEO_COUNTRYCODE VARCHAR(3),
ACTIONGEO_ADM1CODE VARCHAR(5),
ACTIONGEO_LAT DECIMAL(9,4),
ACTIONGEO_LONG DECIMAL(9,4),
ACTIONGEO_FEATUREID VARCHAR(50),
DATEADDED INTEGER,
SOURCEURL VARCHAR(255)
) USING
com.sap.spark.vora
OPTIONS (
storagebackend "hdfs",
files "/user/hdfs/csv/20170318.export.csv",
format "csv",
schema "GLOBALEVENTID BIGINT, SQLDATE INTEGER, MONTHYEAR INTEGER, YYEAR INTEGER, FRACTIONDATE VARCHAR(20), ACTOR1CODE VARCHAR(20), ACTOR1NAME VARCHAR(255), ACTOR1COUNTRYCODE VARCHAR(3), ACTOR1KNOWNGROUPCODE VARCHAR(3), ACTOR1ETHNICCODE VARCHAR(3), ACTOR1RELIGION1CODE VARCHAR(3), ACTOR1RELIGION2CODE VARCHAR(3), ACTOR1TYPE1CODE VARCHAR(3), ACTOR1TYPE2CODE VARCHAR(3), ACTOR1TYPE3CODE VARCHAR(3), ACTOR2CODE VARCHAR(15), ACTOR2NAME VARCHAR(255), ACTOR2COUNTRYCODE VARCHAR(3), ACTOR2KNOWNGROUPCODE VARCHAR(3), ACTOR2ETHNICCODE VARCHAR(3), ACTOR2RELIGION1CODE VARCHAR(3), ACTOR2RELIGION2CODE VARCHAR(3), ACTOR2TYPE1CODE VARCHAR(3), ACTOR2TYPE2CODE VARCHAR(3), ACTOR2TYPE3CODE VARCHAR(3), ISROOTEVENT INTEGER, EVENTCODE VARCHAR(4), EVENTBASECODE VARCHAR(3), EVENTROOTCODE VARCHAR(2), QUADCLASS INTEGER, GOLDSTEINSCALE DECIMAL(3,1), NUMMENTIONS INTEGER, NUMSOURCES INTEGER, NUMARTICLES INTEGER, AVGTONE DOUBLE, ACTOR1GEO_TYPE INTEGER, ACTOR1GEO_FULLNAME VARCHAR(255), ACTOR1GEO_COUNTRYCODE VARCHAR(2), ACTOR1GEO_ADM1CODE VARCHAR(5), ACTOR1GEO_LAT DECIMAL(9,4), ACTOR1GEO_LONG DECIMAL(9,4), ACTOR1GEO_FEATUREID VARCHAR(10), ACTOR2GEO_TYPE INTEGER, ACTOR2GEO_FULLNAME VARCHAR(255), ACTOR2GEO_COUNTRYCODE VARCHAR(3), ACTOR2GEO_ADM1CODE VARCHAR(5), ACTOR2GEO_LAT DECIMAL(9,4), ACTOR2GEO_LONG DECIMAL(9,4), ACTOR2GEO_FEATUREID VARCHAR(10), ACTIONGEO_TYPE INTEGER, ACTIONGEO_FULLNAME VARCHAR(255), ACTIONGEO_COUNTRYCODE VARCHAR(3), ACTIONGEO_ADM1CODE VARCHAR(5), ACTIONGEO_LAT DECIMAL(9,4), ACTIONGEO_LONG DECIMAL(9,4), ACTIONGEO_FEATUREID VARCHAR(50), DATEADDED INTEGER, SOURCEURL VARCHAR(255)"
);
SELECT * FROM EVENTS_RE LIMIT 10;
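As with the disk engine, an illustrative aggregation can confirm the data is queryable in memory, for example the average Goldstein score per CAMEO root event code:
SELECT EVENTROOTCODE, AVG(GOLDSTEINSCALE) AS AVG_GOLDSTEIN, COUNT(*) AS NUM_EVENTS
FROM EVENTS_RE
GROUP BY EVENTROOTCODE
ORDER BY NUM_EVENTS DESC
LIMIT 10;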
files "/user/hdfs/csv/20170318.export.csv",
files "/user/hdfs/csv/*",