https://docs.aws.amazon.com/ja_jp/athena/latest/ug/partitions.html
https://aws.amazon.com/jp/premiumsupport/knowledge-center/athena-create-use-partitioned-tables/
https://docs.aws.amazon.com/ja_jp/athena/latest/ug/lazy-simple-serde.html
https://blog.ingage.jp/entry/2021/05/10/080000
https://dev.classmethod.jp/articles/athena-partition-reinvent/
[1] Hive形式データの場合
[2] Hive形式でないデータの場合
[3] パーティション射影
-- 1. コマンド等のインストール
-- 1.1 aws cli version 2 インストール
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
sudo ./aws/install
aws --version
-- 1.2 jqインストール
sudo yum -y install jq
-- 2. S3 バケットを作成する
aws s3 ls
aws s3 mb s3://bucket123
-- 3. テストデータ作成
echo "1,10,AAA" > f0001.txt
echo "2,10,AAA" > f0002.txt
echo "3,10,AAA" > f0003.txt
echo "4,10,AAA" > f0004.txt
echo "5,10,AAA" > f0005.txt
echo "6,10,AAA" > f0006.txt
echo "7,10,AAA" > f0007.txt
-- 4. クエリ結果の場所の設定
aws athena list-work-groups
aws athena get-work-group \
--work-group primary
aws athena update-work-group \
--work-group primary \
--configuration-updates '{
"EnforceWorkGroupConfiguration": false,
"ResultConfigurationUpdates": {
"OutputLocation": "s3://bucket123/result"
},
"PublishCloudWatchMetricsEnabled": false,
"RequesterPaysEnabled": false,
"EngineVersion": {
"SelectedEngineVersion": "AUTO",
"EffectiveEngineVersion": "Athena engine version 2"
}
}'
-- 5. データベースを作成する
aws glue create-database \
--database-input '{"Name": "test"}'
aws glue get-databases
aws glue get-database \
--name test
-- 6. テーブルを作成する
[1] Hive形式データの場合
aws s3 cp f0001.txt s3://bucket123/tab1/dt=2022-11-19/f0001.txt
aws s3 cp f0002.txt s3://bucket123/tab1/dt=2022-11-19/f0002.txt
aws s3 cp f0003.txt s3://bucket123/tab1/dt=2022-11-20/f0003.txt
aws s3 cp f0004.txt s3://bucket123/tab1/dt=2022-11-20/f0004.txt
aws s3 cp f0005.txt s3://bucket123/tab1/dt=2022-11-21/f0005.txt
aws s3 cp f0006.txt s3://bucket123/tab1/dt=2022-11-21/f0006.txt
aws s3 cp f0007.txt s3://bucket123/tab1/dt=2022-11-22/f0007.txt
aws s3 ls s3://bucket123 --recursive
create external table tab1 (
col1 int
, col2 int
, col3 string
)
partitioned by (dt string)
row format delimited
fields terminated by ','
escaped by '\\'
lines terminated by '\n'
location 's3://bucket123/tab1/';
msck repair table tab1;
select dt,col1,col2,col3 from tab1
where dt >= '2022-11-19'
and dt < '2022-11-20' ;
スキャンしたデータ -> 0.02KB
フルスキャンした場合は0.06KBなので、プルーニングされている
[2] Hive形式でないデータの場合
aws s3 cp f0001.txt s3://bucket123/tab2/2022/11/19/f0001.txt
aws s3 cp f0002.txt s3://bucket123/tab2/2022/11/19/f0002.txt
aws s3 cp f0003.txt s3://bucket123/tab2/2022/11/20/f0003.txt
aws s3 cp f0004.txt s3://bucket123/tab2/2022/11/20/f0004.txt
aws s3 cp f0005.txt s3://bucket123/tab2/2022/11/21/f0005.txt
aws s3 cp f0006.txt s3://bucket123/tab2/2022/11/21/f0006.txt
aws s3 cp f0007.txt s3://bucket123/tab2/2022/11/22/f0007.txt
aws s3 ls s3://bucket123 --recursive
create external table tab2 (
col1 int
, col2 int
, col3 string
)
partitioned by (dt string)
row format delimited
fields terminated by ','
escaped by '\\'
lines terminated by '\n'
location 's3://bucket123/tab2/';
alter table tab2 add partition (dt='2022-11-19') location 's3://bucket123/tab2/2022/11/19/';
alter table tab2 add partition (dt='2022-11-20') location 's3://bucket123/tab2/2022/11/20/';
alter table tab2 add partition (dt='2022-11-21') location 's3://bucket123/tab2/2022/11/21/';
alter table tab2 add partition (dt='2022-11-22') location 's3://bucket123/tab2/2022/11/22/';
select dt,col1,col2,col3 from tab2
where dt >= '2022-11-19'
and dt < '2022-11-20' ;
[3] パーティション射影
aws s3 cp f0001.txt s3://bucket123/tab3/2022/11/19/f0001.txt
aws s3 cp f0002.txt s3://bucket123/tab3/2022/11/19/f0002.txt
aws s3 cp f0003.txt s3://bucket123/tab3/2022/11/20/f0003.txt
aws s3 cp f0004.txt s3://bucket123/tab3/2022/11/20/f0004.txt
aws s3 cp f0005.txt s3://bucket123/tab3/2022/11/21/f0005.txt
aws s3 cp f0006.txt s3://bucket123/tab3/2022/11/21/f0006.txt
aws s3 cp f0007.txt s3://bucket123/tab3/2022/11/22/f0007.txt
aws s3 ls s3://bucket123 --recursive
create external table tab3 (
col1 int
, col2 int
, col3 string
)
partitioned by (dt string)
row format delimited
fields terminated by ','
escaped by '\\'
lines terminated by '\n'
location 's3://bucket123/tab3/'
tblproperties (
"projection.enabled" = "true",
"projection.dt.type" = "date",
"projection.dt.format" = "yyyy/MM/dd",
"projection.dt.range" = "2022/11/19,NOW",
"projection.dt.interval" = "1",
"projection.dt.interval.unit" = "DAYS",
"storage.location.template" = "s3://bucket123/tab3/${dt}/"
);
select dt,col1,col2,col3 from tab3
where dt >= '2022/11/19'
and dt < '2022/11/20' ;
-- 7. クリーンアップ
-- データベースの削除
aws glue get-databases
aws glue delete-database \
--name test
aws glue delete-database \
--name default
-- バケットの削除
aws s3 ls
aws s3 rb s3://bucket123 --force