{Athena}Athena でのデータのパーティション化

 

https://docs.aws.amazon.com/ja_jp/athena/latest/ug/partitions.html
https://aws.amazon.com/jp/premiumsupport/knowledge-center/athena-create-use-partitioned-tables/
https://docs.aws.amazon.com/ja_jp/athena/latest/ug/lazy-simple-serde.html
https://blog.ingage.jp/entry/2021/05/10/080000
https://dev.classmethod.jp/articles/athena-partition-reinvent/


[1] Hive形式データの場合
[2] Hive形式でないデータの場合
[3] パーティション射影


-- 1. コマンド等のインストール

-- 1.1 aws cli version 2 インストール

curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
sudo ./aws/install
aws --version

-- 1.2 jqインストール
sudo yum -y install jq


-- 2. S3 バケットを作成する

aws s3 ls

aws s3 mb s3://bucket123

 

-- 3. テストデータ作成

echo "1,10,AAA" > f0001.txt
echo "2,10,AAA" > f0002.txt
echo "3,10,AAA" > f0003.txt
echo "4,10,AAA" > f0004.txt
echo "5,10,AAA" > f0005.txt
echo "6,10,AAA" > f0006.txt
echo "7,10,AAA" > f0007.txt

 

 


-- 4. クエリ結果の場所の設定


aws athena list-work-groups

aws athena get-work-group \
--work-group primary


aws athena update-work-group \
--work-group primary \
--configuration-updates '{
  "EnforceWorkGroupConfiguration": false,
  "ResultConfigurationUpdates": {
    "OutputLocation": "s3://bucket123/result"
  },
  "PublishCloudWatchMetricsEnabled": false,
  "RequesterPaysEnabled": false,
  "EngineVersion": {
    "SelectedEngineVersion": "AUTO",
    "EffectiveEngineVersion": "Athena engine version 2"
  }
}'

-- 5. データベースを作成する

aws glue create-database \
--database-input '{"Name": "test"}'

aws glue get-databases

aws glue get-database \
--name test


-- 6. テーブルを作成する

[1] Hive形式データの場合


aws s3 cp f0001.txt s3://bucket123/tab1/dt=2022-11-19/f0001.txt
aws s3 cp f0002.txt s3://bucket123/tab1/dt=2022-11-19/f0002.txt
aws s3 cp f0003.txt s3://bucket123/tab1/dt=2022-11-20/f0003.txt
aws s3 cp f0004.txt s3://bucket123/tab1/dt=2022-11-20/f0004.txt
aws s3 cp f0005.txt s3://bucket123/tab1/dt=2022-11-21/f0005.txt
aws s3 cp f0006.txt s3://bucket123/tab1/dt=2022-11-21/f0006.txt
aws s3 cp f0007.txt s3://bucket123/tab1/dt=2022-11-22/f0007.txt


aws s3 ls s3://bucket123 --recursive

 

create external table tab1 (
  col1 int
, col2 int
, col3 string
)
partitioned by (dt string)
row format delimited
  fields terminated by ','
  escaped by '\\'
  lines terminated by '\n'
location 's3://bucket123/tab1/';


msck repair table tab1;


select dt,col1,col2,col3 from tab1 
where dt >= '2022-11-19' 
and dt < '2022-11-20' ;

スキャンしたデータ -> 0.02KB
フルスキャンした場合は0.06KBなので、プルーニングされている


[2] Hive形式でないデータの場合

aws s3 cp f0001.txt s3://bucket123/tab2/2022/11/19/f0001.txt
aws s3 cp f0002.txt s3://bucket123/tab2/2022/11/19/f0002.txt
aws s3 cp f0003.txt s3://bucket123/tab2/2022/11/20/f0003.txt
aws s3 cp f0004.txt s3://bucket123/tab2/2022/11/20/f0004.txt
aws s3 cp f0005.txt s3://bucket123/tab2/2022/11/21/f0005.txt
aws s3 cp f0006.txt s3://bucket123/tab2/2022/11/21/f0006.txt
aws s3 cp f0007.txt s3://bucket123/tab2/2022/11/22/f0007.txt


aws s3 ls s3://bucket123 --recursive

 

create external table tab2 (
  col1 int
, col2 int
, col3 string
)
partitioned by (dt string)
row format delimited
  fields terminated by ','
  escaped by '\\'
  lines terminated by '\n'
location 's3://bucket123/tab2/';

 

alter table tab2 add partition (dt='2022-11-19') location 's3://bucket123/tab2/2022/11/19/';
alter table tab2 add partition (dt='2022-11-20') location 's3://bucket123/tab2/2022/11/20/';
alter table tab2 add partition (dt='2022-11-21') location 's3://bucket123/tab2/2022/11/21/';
alter table tab2 add partition (dt='2022-11-22') location 's3://bucket123/tab2/2022/11/22/';


select dt,col1,col2,col3 from tab2
where dt >= '2022-11-19' 
and dt < '2022-11-20' ;

 

[3] パーティション射影

aws s3 cp f0001.txt s3://bucket123/tab3/2022/11/19/f0001.txt
aws s3 cp f0002.txt s3://bucket123/tab3/2022/11/19/f0002.txt
aws s3 cp f0003.txt s3://bucket123/tab3/2022/11/20/f0003.txt
aws s3 cp f0004.txt s3://bucket123/tab3/2022/11/20/f0004.txt
aws s3 cp f0005.txt s3://bucket123/tab3/2022/11/21/f0005.txt
aws s3 cp f0006.txt s3://bucket123/tab3/2022/11/21/f0006.txt
aws s3 cp f0007.txt s3://bucket123/tab3/2022/11/22/f0007.txt


aws s3 ls s3://bucket123 --recursive

 

create external table tab3 (
  col1 int
, col2 int
, col3 string
)
partitioned by (dt string)
row format delimited
  fields terminated by ','
  escaped by '\\'
  lines terminated by '\n'
location 's3://bucket123/tab3/'
tblproperties (
 "projection.enabled" = "true",

 "projection.dt.type" = "date",
 "projection.dt.format" = "yyyy/MM/dd",
 "projection.dt.range" = "2022/11/19,NOW",
 "projection.dt.interval" = "1",
 "projection.dt.interval.unit" = "DAYS",

  "storage.location.template" = "s3://bucket123/tab3/${dt}/"
);


select dt,col1,col2,col3 from tab3
where dt >= '2022/11/19' 
and dt < '2022/11/20' ;

 

-- 7. クリーンアップ


-- データベースの削除

aws glue get-databases


aws glue delete-database \
--name test

aws glue delete-database \
--name default

 


-- バケットの削除

aws s3 ls

aws s3 rb s3://bucket123 --force