-- 参考(AWS公式チュートリアル): https://docs.aws.amazon.com/ja_jp/comprehend/latest/dg/tutorial-reviews.html
-- 1. コマンド等のインストール
-- 1.1 aws cli version 2 インストール
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
sudo ./aws/install
aws --version
-- 1.2 jqインストール
sudo yum -y install jq
-- 2. S3 バケットを作成しドキュメントを追加する
wget https://docs.aws.amazon.com/ja_jp/comprehend/latest/dg/samples/tutorial-reviews-data.zip
unzip tutorial-reviews-data.zip
ls -ltr
head amazon-reviews.csv
aws s3 mb s3://bucket123
aws s3 ls
aws s3 cp amazon-reviews.csv s3://bucket123/input/
aws s3 ls s3://bucket123 --recursive
-- 3. Amazon Comprehend用とGlue用のIAMロールの作成
-- 3.1 IAMポリシー作成(Comprehend用)
vim policy01.json
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"s3:GetObject"
],
"Resource": [
"arn:aws:s3:::bucket123/*"
],
"Effect": "Allow"
},
{
"Action": [
"s3:ListBucket"
],
"Resource": [
"arn:aws:s3:::bucket123"
],
"Effect": "Allow"
},
{
"Action": [
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::bucket123/*"
],
"Effect": "Allow"
}
]
}
aws iam create-policy \
--policy-name policy01 \
--policy-document file://policy01.json
-- 3.2 IAMロール作成(Comprehend用)
vim role01.json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "comprehend.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
aws iam create-role \
--role-name role01 \
--assume-role-policy-document file://role01.json
-- 3.3 ポリシーをロールにアタッチ(Comprehend用)
aws iam attach-role-policy \
--policy-arn arn:aws:iam::999999999999:policy/policy01 \
--role-name role01
-- 3.4 IAMポリシー作成(Glue用)
vim policy02.json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::bucket123/sentiment-results*",
"arn:aws:s3:::bucket123/entities-results*"
]
}
]
}
aws iam create-policy \
--policy-name policy02 \
--policy-document file://policy02.json
-- 3.5 IAMロール作成(Glue用)
vim role02.json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "glue.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
aws iam create-role \
--role-name role02 \
--assume-role-policy-document file://role02.json
-- 3.6 ポリシーをロールにアタッチ(Glue用)
aws iam attach-role-policy \
--policy-arn arn:aws:iam::999999999999:policy/policy02 \
--role-name role02
aws iam attach-role-policy \
--policy-arn arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole \
--role-name role02
-- 4. Amazon S3 のドキュメントに対する分析ジョブの実行
-- 4.1 感情分析ジョブの開始
aws comprehend start-sentiment-detection-job \
--input-data-config S3Uri=s3://bucket123/input/ \
--output-data-config S3Uri=s3://bucket123/output/ \
--data-access-role-arn arn:aws:iam::999999999999:role/role01 \
--job-name job01 \
--language-code en
aws comprehend list-sentiment-detection-jobs
aws comprehend describe-sentiment-detection-job \
--job-id 11111111111111111111111111111111
-- 4.2 エンティティ分析ジョブの開始
aws comprehend start-entities-detection-job \
--input-data-config S3Uri=s3://bucket123/input/ \
--output-data-config S3Uri=s3://bucket123/output/ \
--data-access-role-arn arn:aws:iam::999999999999:role/role01 \
--job-name job02 \
--language-code en
aws comprehend list-entities-detection-jobs
aws comprehend describe-entities-detection-job \
--job-id 22222222222222222222222222222222
-- 5. Amazon Comprehend の出力をデータ視覚化のために準備する
-- 5.1 出力ファイルをダウンロードする
aws s3 cp s3://bucket123/output/999999999999-SENTIMENT-11111111111111111111111111111111/output/output.tar.gz output01.tar.gz
aws s3 cp s3://bucket123/output/999999999999-NER-22222222222222222222222222222222/output/output.tar.gz output02.tar.gz
-- 5.2 出力ファイルを展開する
tar -xvf output01.tar.gz --transform 's,^,sentiment-,'
tar -xvf output02.tar.gz --transform 's,^,entities-,'
-- 5.3 抽出したファイルをアップロードする
aws s3 cp sentiment-output s3://bucket123/sentiment-results/
aws s3 cp entities-output s3://bucket123/entities-results/
aws s3 ls s3://bucket123 --recursive
-- 5.4 データベースを作成する
aws glue create-database \
--database-input Name="test"
aws glue get-databases
aws glue get-database \
--name test
-- 5.5 クローラを作成する
aws glue create-crawler \
--name crawler01 \
--role arn:aws:iam::999999999999:role/role02 \
--database-name test \
--targets '{
"S3Targets": [
{ "Path": "s3://bucket123/sentiment-results" },
{ "Path": "s3://bucket123/entities-results" }
]
}'
aws glue list-crawlers
aws glue get-crawlers
aws glue get-crawler \
--name crawler01
-- 5.6 クローラを実行する
aws glue start-crawler \
--name crawler01
aws glue get-crawler \
--name crawler01
-- 5.7 クローラが作成したテーブルを表示する
aws glue get-tables \
--database-name test
aws glue get-table \
--database-name test \
--name sentiment_results
aws glue get-table \
--database-name test \
--name entities_results
-- 5.8 Athenaクエリ結果の保存場所の設定
※Athenaコンソールの「設定」タブで、クエリ結果の場所に以下を指定する
→ s3://bucket123/query-results/
-- 5.9 ネスト解除済みテーブルの作成
※Athenaコンソールで実施
CREATE TABLE sentiment_results_final AS
SELECT file, line, sentiment,
sentimentscore.mixed AS mixed,
sentimentscore.negative AS negative,
sentimentscore.neutral AS neutral,
sentimentscore.positive AS positive
FROM sentiment_results
;
CREATE TABLE entities_results_1 AS
SELECT file, line, nested FROM entities_results
CROSS JOIN UNNEST(entities) as t(nested)
;
CREATE TABLE entities_results_final AS
SELECT file, line,
nested.beginoffset AS beginoffset,
nested.endoffset AS endoffset,
nested.score AS score,
nested.text AS entity,
nested.type AS category
FROM entities_results_1
;
-- 6. クリーンアップ
-- テーブルの削除
aws glue get-tables \
--database-name test | jq -r .TableList[].Name
aws glue delete-table \
--database-name test \
--name sentiment_results
aws glue delete-table \
--database-name test \
--name entities_results
aws glue delete-table \
--database-name test \
--name sentiment_results_final
aws glue delete-table \
--database-name test \
--name entities_results_final
aws glue delete-table \
--database-name test \
--name entities_results_1
-- クローラの削除
aws glue list-crawlers
aws glue get-crawlers
aws glue delete-crawler \
--name crawler01
-- データベースの削除
aws glue get-databases
aws glue get-database \
--name test
aws glue delete-database \
--name test
-- IAMロールの削除(Glue用)
aws iam list-roles | grep role02
aws iam detach-role-policy \
--role-name role02 \
--policy-arn arn:aws:iam::999999999999:policy/policy02
aws iam detach-role-policy \
--role-name role02 \
--policy-arn arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole
aws iam delete-role --role-name role02
-- IAMポリシーの削除(Glue用)
aws iam list-policies | grep policy02
aws iam delete-policy \
--policy-arn arn:aws:iam::999999999999:policy/policy02
-- IAMロールの削除(Comprehend用)
aws iam list-roles | grep role01
aws iam detach-role-policy \
--role-name role01 \
--policy-arn arn:aws:iam::999999999999:policy/policy01
aws iam delete-role --role-name role01
-- IAMポリシーの削除(Comprehend用)
aws iam list-policies | grep policy01
aws iam delete-policy \
--policy-arn arn:aws:iam::999999999999:policy/policy01
-- S3バケットの削除
aws s3 ls
aws s3 rb s3://bucket123 --force