https://cloud.google.com/dataproc/docs/tutorials/gcs-connector-spark-tutorial?hl=ja#python
-- 1. Preliminary setup
gcloud init
gcloud auth list
gcloud --version
gcloud projects create project01-9999999 \
--name="project01"
gcloud config list
gcloud config set project project01-9999999
gcloud config set compute/region asia-northeast1 --quiet
gcloud config set run/region asia-northeast1 --quiet
gcloud config set compute/zone asia-northeast1-a --quiet
gcloud beta billing accounts list
gcloud beta billing projects link project01-9999999 --billing-account=111111-111111-111111
gcloud services enable compute.googleapis.com --project project01-9999999
gcloud components update
-- 2. Enable the Dataproc, Compute Engine, and Cloud Storage APIs
gcloud services list --enabled
gcloud services list --available
gcloud services enable dataproc.googleapis.com --project project01-9999999
gcloud services enable compute.googleapis.com --project project01-9999999
gcloud services enable storage.googleapis.com --project project01-9999999
-- 3. Create a service account
gcloud iam service-accounts create sa123 \
--description="sa123" \
--display-name="sa123"
gcloud iam service-accounts list
gcloud projects add-iam-policy-binding project01-9999999 \
--member="serviceAccount:sa123@project01-9999999.iam.gserviceaccount.com" \
--role="roles/owner"
gcloud projects get-iam-policy project01-9999999
gcloud iam service-accounts keys create ~/key01.json \
--iam-account=sa123@project01-9999999.iam.gserviceaccount.com
cat ~/key01.json
gcloud iam service-accounts keys list \
--iam-account=sa123@project01-9999999.iam.gserviceaccount.com
export GOOGLE_APPLICATION_CREDENTIALS=~/key01.json
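-- Optional check (a sketch, assuming the google-auth library that is installed together with google-cloud-storage in step 7):
-- confirm that client libraries pick up the key file from GOOGLE_APPLICATION_CREDENTIALS.
import google.auth

# google.auth.default() reads the key file pointed to by GOOGLE_APPLICATION_CREDENTIALS
credentials, project_id = google.auth.default()
print(project_id)                         # expected: project01-9999999
print(credentials.service_account_email)  # expected: sa123@project01-9999999.iam.gserviceaccount.com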
-- 4. Create a Cloud Storage bucket
gcloud storage buckets create gs://bucket123 \
--default-storage-class=Standard \
--no-enable-autoclass \
--location=us-central1 \
--public-access-prevention \
--uniform-bucket-level-access
gcloud storage ls
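-- Optional check (a sketch, assuming google-cloud-storage from step 7): read back the bucket settings from Python.
from google.cloud import storage

client = storage.Client(project="project01-9999999")
bucket = client.get_bucket("bucket123")   # raises NotFound if the bucket does not exist
print(bucket.location, bucket.storage_class)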
-- 5. Create a Dataproc cluster
gcloud dataproc clusters create cluster01 \
--project=project01-9999999 \
--region=us-central1 \
--single-node
gcloud dataproc clusters list --region=us-central1
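-- Optional check (a sketch, assuming `pip install google-cloud-dataproc`, which this walkthrough does not install): query the cluster state from Python.
from google.cloud import dataproc_v1

cluster_client = dataproc_v1.ClusterControllerClient(
    client_options={"api_endpoint": "us-central1-dataproc.googleapis.com:443"}
)
cluster = cluster_client.get_cluster(
    project_id="project01-9999999", region="us-central1", cluster_name="cluster01"
)
print(cluster.status.state)  # expected to report a running state once the cluster is up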
-- 6. Copy public data into the Cloud Storage bucket
gsutil cp gs://pub/shakespeare/rose.txt \
gs://bucket123/input/rose.txt
gsutil ls -r gs://bucket123
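-- The same copy, sketched with the google-cloud-storage client (assumes the public gs://pub bucket is readable by the service account):
from google.cloud import storage

client = storage.Client(project="project01-9999999")
src_bucket = client.bucket("pub")
dst_bucket = client.bucket("bucket123")
# Server-side copy of the public Shakespeare sample into the input/ prefix.
src_bucket.copy_blob(src_bucket.blob("shakespeare/rose.txt"), dst_bucket, "input/rose.txt")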
-- 7. Set up a Python development environment
python -V
mkdir 123
cd 123
python -m venv env
source env/bin/activate
pip install google-cloud-storage
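-- Quick sanity check for the virtualenv (a sketch): list the bucket contents with the freshly installed client.
from google.cloud import storage

client = storage.Client(project="project01-9999999")
for blob in client.list_blobs("bucket123"):
    print(blob.name)  # expected to include input/rose.txt from step 6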
-- 8. Prepare the Spark wordcount job
vim word-count.py
#!/usr/bin/env python
import pyspark
import sys
# Require exactly two arguments: the input and output Cloud Storage URIs.
if len(sys.argv) != 3:
    raise Exception("Exactly 2 arguments are required: <inputUri> <outputUri>")

inputUri = sys.argv[1]
outputUri = sys.argv[2]

sc = pyspark.SparkContext()
# Split every line into words, then count occurrences of each word.
lines = sc.textFile(inputUri)
words = lines.flatMap(lambda line: line.split())
wordCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda count1, count2: count1 + count2)
wordCounts.saveAsTextFile(outputUri)
-- 9. Submit the job
gcloud dataproc jobs submit pyspark word-count.py \
--cluster=cluster01 \
--region=us-central1 \
-- gs://bucket123/input/ gs://bucket123/output/
gcloud dataproc jobs list --region=us-central1
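-- A rough Python alternative to the gcloud submit above (a sketch, assuming `pip install google-cloud-dataproc`
-- and that word-count.py has first been uploaded to gs://bucket123/):
from google.cloud import dataproc_v1

job_client = dataproc_v1.JobControllerClient(
    client_options={"api_endpoint": "us-central1-dataproc.googleapis.com:443"}
)
job = {
    "placement": {"cluster_name": "cluster01"},
    "pyspark_job": {
        "main_python_file_uri": "gs://bucket123/word-count.py",
        "args": ["gs://bucket123/input/", "gs://bucket123/output/"],
    },
}
response = job_client.submit_job(project_id="project01-9999999", region="us-central1", job=job)
print(response.reference.job_id)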
-- 10. View the output
gsutil ls -r gs://bucket123
gsutil cat gs://bucket123/output/*
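-- Reading the output from Python instead (a sketch using google-cloud-storage):
from google.cloud import storage

client = storage.Client(project="project01-9999999")
for blob in client.list_blobs("bucket123", prefix="output/"):
    # part-* objects hold the (word, count) tuples; _SUCCESS is an empty marker file.
    print(blob.download_as_text())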
deactivate
-- 11. Clean up
gcloud dataproc clusters delete cluster01 \
--region=us-central1 \
--quiet
gcloud storage rm gs://bucket123 --recursive
gcloud storage ls
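-- An equivalent bucket cleanup from Python (a sketch, as an alternative to the gcloud storage rm above; force=True only works for buckets with few objects):
from google.cloud import storage

client = storage.Client(project="project01-9999999")
client.get_bucket("bucket123").delete(force=True)  # deletes the remaining objects, then the bucket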
gcloud projects list
gcloud projects delete project01-9999999 \
--quiet