2021年4月、SAPはSAP HANA Cloud用の新しいテキストマイニング機能を発表しました。
-- Training corpus for the TF-IDF example: one row per document, holding the
-- document id, its raw text, and the category label assigned to it.
CREATE COLUMN TABLE PAL_TFIDF_DATA_TAB (
    "ID"       NVARCHAR(1000),
    "CONTENT"  NVARCHAR(1000),
    "CATEGORY" NVARCHAR(1000)
);

-- Explicit column lists keep these inserts correct even if the table later
-- gains a column or its column order changes.
INSERT INTO PAL_TFIDF_DATA_TAB ("ID", "CONTENT", "CATEGORY") VALUES ('doc1', 'term1 term2 term2 term3 term3 term3', 'CATEGORY_1');
INSERT INTO PAL_TFIDF_DATA_TAB ("ID", "CONTENT", "CATEGORY") VALUES ('doc2', 'term2 term3 term3 term4 term4 term4', 'CATEGORY_1');
INSERT INTO PAL_TFIDF_DATA_TAB ("ID", "CONTENT", "CATEGORY") VALUES ('doc3', 'term3 term4 term4 term5 term5 term5', 'CATEGORY_2');
INSERT INTO PAL_TFIDF_DATA_TAB ("ID", "CONTENT", "CATEGORY") VALUES ('doc5', 'term3 term4 term4 term5 term5 term5 term5 term5 term5', 'CATEGORY_2');
INSERT INTO PAL_TFIDF_DATA_TAB ("ID", "CONTENT", "CATEGORY") VALUES ('doc4', 'term4 term6', 'CATEGORY_3');
INSERT INTO PAL_TFIDF_DATA_TAB ("ID", "CONTENT", "CATEGORY") VALUES ('doc6', 'term4 term6 term6 term6', 'CATEGORY_3');

-- Call the PAL term-frequency analysis procedure on the corpus table.
-- NOTE(review): PARAMETERS and the three PAL_TM_* tables are not defined in
-- this snippet; presumably they are created elsewhere (or this call is meant
-- as a signature sketch) -- confirm before running.
CALL _SYS_AFL.PAL_TF_ANALYSIS(
    PAL_TFIDF_DATA_TAB,
    PARAMETERS,
    PAL_TM_TERM_TAB,
    PAL_TM_DOC_TERM_FREQ_TAB,
    PAL_TM_CATE_TAB
);
-- Documents to be classified: document id and raw text only (no category).
CREATE COLUMN TABLE PAL_TM_PRED_TAB (
    "ID"      NVARCHAR(1000),
    "CONTENT" NVARCHAR(1000)
);

-- Explicit column lists keep these inserts correct even if the table later
-- gains a column or its column order changes.
INSERT INTO PAL_TM_PRED_TAB ("ID", "CONTENT") VALUES ('doc10', 'term2 term2 term3 term3');
INSERT INTO PAL_TM_PRED_TAB ("ID", "CONTENT") VALUES ('doc11', 'term4 term4 term4 term5');

-- Call the PAL text classification procedure, passing the three PAL_TM_*
-- tables, the documents to classify, PARAMETERS, and two `?` placeholders.
-- NOTE(review): PARAMETERS and the PAL_TM_* tables are not defined in this
-- snippet; the two `?` arguments are presumably the procedure's result sets
-- -- confirm against the PAL reference before running.
CALL _SYS_AFL.PAL_TEXTCLASSIFICATION(
    PAL_TM_TERM_TAB,
    PAL_TM_DOC_TERM_FREQ_TAB,
    PAL_TM_CATE_TAB,
    PAL_TM_PRED_TAB,
    PARAMETERS, ?, ?);
import pandas as pd
from datetime import date, datetime, timedelta
import os
from hdbcli import dbapi
from hana_ml import dataframe
from hana_ml.dataframe import create_dataframe_from_pandas
from hana_ml.algorithms.pal.partition import train_test_val_split
from hana_ml.text.tm import tf_analysis
from hana_ml.text.tm import get_related_doc, get_related_term, get_relevant_doc, get_relevant_term, get_suggested_term, text_classification
# Display tweaks for collected results: do not truncate long cell values,
# and left-justify column headers.
pd.options.display.max_colwidth = None
pd.options.display.colheader_justify = "left"
# Instantiate the hana_ml connection object.
# Connection details are read from environment variables so real credentials
# do not have to be written into the source; when a variable is unset the
# original placeholder value is used, so behaviour is unchanged by default.
conn = dataframe.ConnectionContext(
    address=os.environ.get('HANA_ADDRESS', 'XXXX.hana.canary-eu10.hanacloud.ondemand.com'),
    port=int(os.environ.get('HANA_PORT', '443')),
    user=os.environ.get('HANA_USER', 'XXXX'),
    password=os.environ.get('HANA_PASSWORD', 'XXXXX'),
    encrypt='true',
    # NOTE(review): certificate validation is switched off here, which leaves
    # the TLS connection unauthenticated -- confirm whether 'true' can be used.
    sslValidateCertificate='false',
)

# Send a basic SELECT statement and display the result as a connectivity check.
sql = 'SELECT 12345 FROM DUMMY'
df_remote = conn.sql(sql)
print(df_remote.collect())
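# Optional one-time setup, intentionally left commented out: create a dedicated
# TEXTANALYSIS user and grant it the role needed to execute PAL procedures.
# The password shown is a sample value -- replace it before running these statements.
# cursor = conn.connection.cursor()
# cursor.execute('CREATE USER TEXTANALYSIS Password "Textan123" SET USERGROUP DEFAULT;')
# cursor.execute('ALTER USER TEXTANALYSIS DISABLE PASSWORD LIFETIME;')
# cursor.execute('GRANT "AFL__SYS_AFL_AFLPAL_EXECUTE_WITH_GRANT_OPTION" TO TEXTANALYSIS')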
# Build the text-mining source set from the COMPLAINTS table: keep rows whose
# MAKETXT is 'FORD', drop rows with duplicate CONTENT, add a generated ID
# column and cast that ID to NVARCHAR.
df_tm_source = (
    dataframe.DataFrame(
        conn,
        'select "CONTENT", "CATEGORY", "MFR_NAME", "MAKETXT","MODELTXT","YEARTXT","CITY","STATE" FROM COMPLAINTS',
    )
    .filter("MAKETXT = 'FORD'")
    .drop_duplicates(['CONTENT'])
    .add_id('ID')
    .cast('ID', 'NVARCHAR (5000)')
)

# Split into training (10%), testing (10%) and validation (80%) partitions
# with a fixed seed.
train, test, valid = train_test_val_split(
    data=df_tm_source,
    training_percentage=0.1,
    validation_percentage=0.8,
    testing_percentage=0.1,
    random_seed=41,
)

# The bare expressions below are notebook-style: their values are only shown
# when run interactively.
train.count(), valid.count(), test.count()
test.select(['ID', 'MFR_NAME', 'CONTENT', 'CATEGORY']).head(5).collect()

# Term-frequency analysis over the training partition.
tfidf = tf_analysis(train.select(['ID', 'CONTENT', 'CATEGORY']))

# ID was cast to NVARCHAR above, so it is compared against string literals;
# comparing it with an integer literal would rely on implicit type conversion.
# The single test document is selected once and reused by the calls below.
sample_doc = test.filter("ID = '1'")

tc = text_classification(
    sample_doc.select(['ID', 'CONTENT']),
    tfidf,
    thread_ratio=1,
    k_nearest_neighbours=1,
)
tc[0].collect()

get_relevant_term(sample_doc.select('CONTENT'), tfidf).head(10).collect()
get_related_doc(sample_doc.select('CONTENT'), tfidf).head(10).collect()

# Look up two specific training documents by ID and read the second column of
# the first returned row (presumably CONTENT -- verify the column order).
train.filter("ID = '113733'").collect().iloc[0, 1]
train.filter("ID = '46'").collect().iloc[0, 1]
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
29 | |
21 | |
10 | |
7 | |
7 | |
6 | |
6 | |
5 | |
5 | |
5 |