{Athena}使用開始方法 - HTN20190109の日記

※テーブルの場所(Location)には、データファイルそのものではなく、データファイルを格納しているS3のディレクトリ(プレフィックス)を指定する。
データファイルを直接指定した場合、SELECT は0件のレコードを返す。

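例えば、テーブルの場所が次のようにデータファイル自体を指している場合、Athena クエリは0件のレコードを返します。

s3://doc-example-bucket/table1.csv
s3://doc-example-bucket/table2.csv

この場合は、テーブルごとにプレフィックスを分けてファイルを配置し(例: s3://doc-example-bucket/table1/table1.csv)、テーブルの場所には s3://doc-example-bucket/table1/ を指定します。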

-- 1. コマンド等のインストール

-- 1.1 aws cli version 2 インストール

curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
sudo ./aws/install
aws --version

-- 1.2 jqインストール
sudo yum -y install jq

-- 2. S3 バケットを作成する

aws s3 ls

aws s3 mb s3://bucket123

-- 3. S3にテストデータアップロード

vim tab1_0001.txt
1,10,AAA
2,20,BBB
3,30,CCC

vim tab1_0002.txt
4,40,DDD
5,50,EEE
6,60,FFF

aws s3 cp tab1_0001.txt s3://bucket123/tab1/tab1_0001.txt
aws s3 cp tab1_0002.txt s3://bucket123/tab1/tab1_0002.txt

aws s3 ls s3://bucket123 --recursive

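-- 4. クエリの結果の場所の設定

※ EngineVersion の EffectiveEngineVersion は読み取り専用の項目で、update-work-group で指定しても無視される(実際に反映されるのは SelectedEngineVersion のみ)。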

aws athena list-work-groups

aws athena get-work-group \
--work-group primary

aws athena update-work-group \
--work-group primary \
--configuration-updates '{
"EnforceWorkGroupConfiguration": false,
"ResultConfigurationUpdates": {
"OutputLocation": "s3://bucket123/result"
},
"PublishCloudWatchMetricsEnabled": false,
"RequesterPaysEnabled": false,
"EngineVersion": {
"SelectedEngineVersion": "AUTO",
"EffectiveEngineVersion": "Athena engine version 2"
}
}'

-- 5. データベースを作成する

aws glue create-database \
--database-input '{"Name": "test"}'

aws glue get-databases

aws glue get-database \
--name test

-- 6. テーブルを作成する

aws glue create-table \
--database-name test \
--table-input '{
"Name": "tab1",
"StorageDescriptor": {
"Columns": [
{
"Name": "col1",
"Type": "int"
},
{
"Name": "col2",
"Type": "int"
},
{
"Name": "col3",
"Type": "string"
}
],
"Location": "s3://bucket123/tab1",
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"SerdeInfo": {
"SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"Parameters": {
"field.delim": ",",
"line.delim": "\n",
"serialization.format": ","
}
}
},
"PartitionKeys": [],
"TableType": "EXTERNAL_TABLE"
}'

aws glue get-tables \
--database-name test

aws glue get-table \
--database-name test \
--name tab1

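-- 7. データをクエリする

※ start-query-execution が返す QueryExecutionId を、続く get-query-execution / get-query-results の --query-execution-id に指定する(下記のIDは例)。
※ EnforceWorkGroupConfiguration が false のため、--result-configuration で指定した s3://bucket123/output が、手順4でワークグループに設定した s3://bucket123/result より優先される。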

aws athena start-query-execution \
--query-string "SELECT * FROM test.tab1;" \
--result-configuration OutputLocation=s3://bucket123/output

aws athena get-query-execution \
--query-execution-id 66666666-7777-8888-9999-aaaaaaaaaaaa

aws athena get-query-results \
--query-execution-id 66666666-7777-8888-9999-aaaaaaaaaaaa

aws s3 ls s3://bucket123 --recursive

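-- 8. クエリを保存する

※ create-named-query が返す NamedQueryId を、続く get-named-query / delete-named-query の --named-query-id に指定する(下記のIDは例)。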

aws athena list-named-queries

aws athena create-named-query \
--name query01 \
--database test \
--query-string "SELECT * FROM test.tab1;"

aws athena get-named-query \
--named-query-id 11111111-2222-3333-4444-555555555555

aws athena delete-named-query \
--named-query-id 11111111-2222-3333-4444-555555555555

-- 9. クリーンアップ

-- テーブルの削除

aws glue get-tables \
--database-name test

aws glue get-table \
--database-name test \
--name tab1

aws glue delete-table \
--database-name test \
--name tab1

-- データベースの削除

aws glue get-databases

aws glue get-database \
--name test

aws glue delete-database \
--name test

aws glue get-database \
--name default

aws glue delete-database \
--name default

-- バケットの削除

aws s3 ls

aws s3 rb s3://bucket123 --force