PG-Strom

 

https://heterodb.github.io/pg-strom/ja/install/
https://sakaik.hateblo.jp/entry/20230824/pgstrom_install
https://tech.virtualtech.jp/entry/2023/06/12/171529
https://zenn.dev/yumizz/articles/73d6c7d1085d2f


GPUデバイス
Compute Capability が6.0以降のモデル(Pascal世代以降)である必要があります。

 

 

OS: Rocky Linux 9
GPU: GeForce GTX 1050  ( Compute Capability = 6.1 , Pascal世代 )
CPU: 4コア
Memory: 32G
Disk: 300G
CUDA: 12.5
PostgreSQL: 16
PG-Strom: 5.2


-- 1. 事前作業

dnf update -y
reboot
uname -r


dnf groupinstall -y "Development Tools"

systemctl set-default multi-user.target
systemctl isolate multi-user.target

 

-- 2. nouveauドライバの無効化

lsmod | grep nouveau

cat > /etc/modprobe.d/blacklist-nouveau.conf <<EOF
blacklist nouveau
options nouveau modeset=0
EOF

dracut -f
reboot

lsmod | grep nouveau

 

 

-- 3. 追加リポジトリの有効化

dnf install -y epel-release

dnf install -y https://heterodb.github.io/swdc/yum/rhel9-noarch/heterodb-swdc-1.3-1.el9.noarch.rpm

 

 

 

-- 4. CUDA Toolkitのインストール


cd /etc/yum.repos.d/

curl -O https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

cd

dnf search cuda

dnf install -y dkms
dnf module install -y nvidia-driver:latest-dkms
dnf install -y cuda-12-5
reboot

lsmod | grep -i nvidia

nvidia-smi

/usr/local/cuda-12.5/bin/nvcc -V

-- 5. PostgreSQLのインストール


dnf install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-9-x86_64/pgdg-redhat-repo-latest.noarch.rpm
dnf -y module disable postgresql

dnf config-manager --set-enabled crb

dnf install -y postgresql16-devel postgresql16-server

 

ls -la /var/lib/pgsql/16/data

/usr/pgsql-16/bin/postgresql-16-setup initdb


-- 6. PG-Stromのインストール

dnf install -y pg_strom-PG16


-- 7. PG-Stromの設定


su - postgres


vi /var/lib/pgsql/16/data/postgresql.conf 

shared_preload_libraries = '$libdir/pg_strom'
max_worker_processes = 100
shared_buffers = 10GB
work_mem = 1GB

exit

systemctl restart postgresql-16
systemctl status postgresql-16

su - postgres

psql -U postgres
CREATE EXTENSION pg_strom ;
\dx

 

-- 8. 動作確認


CREATE TABLE t_test1 AS
 SELECT       x, 'a'::char(100) AS y, 'b'::char(100) AS z
 FROM   generate_series(1, 5000000) AS x
 ORDER BY random();
 

EXPLAIN VERBOSE SELECT count(*)
 FROM   t_test1
 WHERE sqrt(x) > 0
 GROUP BY y;


 HashAggregate  (cost=62568.17..62570.17 rows=200 width=412)
   Output: pgstrom.fcount((pgstrom.nrows())), y
   Group Key: t_test1.y
   ->  Gather  (cost=62545.17..62567.17 rows=200 width=412)
         Output: (pgstrom.nrows()), y
         Workers Planned: 2
         ->  Parallel Custom Scan (GpuPreAgg) on public.t_test1  (cost=61545.17..61547.17 rows=200 width=412)
               Output: (pgstrom.nrows()), y
               GPU Projection: pgstrom.nrows(), y
               GPU Scan Quals: (sqrt((x)::double precision) > '0'::double precision) [rows: 1323648 -> 183840]
               KVars-Slot: <slot=0, type='int4', expr='x', kv_off=0x0000>, <slot=1, type='bpchar', expr='y', kv_off=0x1400>, <slot=2, type='bpchar', expr='y'>
               KVecs-Buffer: nbytes: 18432, ndims: 2, items=[kvec0=<0x0000-13ff, type='int4', expr='x'>, kvec1=<0x1400-47ff, type='bpchar', expr='y'>]
               LoadVars OpCode: {Packed items[0]={LoadVars(depth=0): kvars=[<slot=0, type='int4' resno=1(x)>, <slot=1, type='bpchar' resno=2(y)>]}}
               MoveVars OpCode: {Packed items[0]={MoveVars(depth=0): items=[<slot=1, offset=0x1400-47ff, type='bpchar', expr='y'>]}}}
               Scan Quals OpCode: {Func(bool)::float8gt args=[{Func(float8)::dsqrt arg={Func(float8)::float8 arg={Var(int4): slot=0, expr='x'}}}, {Const(float8): value='0'}]}
               Group-By KeyHash OpCode: {HashValue arg={SaveExpr: <slot=1, type='bpchar'> arg={Var(bpchar): kvec=0x1400-4800, expr='y'}}}
               Group-By KeyLoad OpCode: {LoadVars(depth=-2): kvars=[<slot=2, type='bpchar' resno=2(y)>]}
               Group-By KeyComp OpCode: {Func(bool)::bpchareq args=[{Var(bpchar): slot=1, expr='y'}, {Var(bpchar): slot=2, expr='y'}]}
               Partial Aggregation OpCode: {AggFuncs <nrows[*], vref[slot=1, expr='y']> arg={SaveExpr: <slot=1, type='bpchar'> arg={Var(bpchar): kvec=0x1400-4800, expr='y'}}}
               Fallback-desc: [<dest='1', expr='y', depth=0:1>, <dest='2', expr='x', depth=0:0>]
               Partial Function BufSz: 8
               CUDA Stack Size: 3936
(22 rows)