ダンプのパラレル実行 - HTN20190109の日記

MySQL

(8.0.22)

CPUコア数=4

(パターン1)通常テーブルテーブル数=1 全データ件数=400万件
(パターン2)通常テーブルテーブル数=4 全データ件数=400万件
(パターン3)パーティションテーブルテーブル数=1 パーティション数=4 全データ件数=400万件

パラレル度=1,4,8

全部で9パターン

-- テストデータ作成

-- Pattern 1 table: a single regular table keyed on col1.
-- "if exists" keeps the drop from raising an error when the script is run
-- against a schema where tab1 has not been created yet.
drop table if exists tab1;
create table tab1(col1 int, col2 int);
alter table tab1 add constraint tab1pk primary key(col1);

-- proc1: seeds tab1 with rows (i, i) for i = 1..100000, one insert per iteration.
-- "if exists" keeps the drop from raising an error on the first run.
drop procedure if exists proc1;

delimiter //
create procedure proc1()
begin
    declare i bigint;
    set i = 1;
    while i <= 100000 do
        insert into tab1(col1, col2) values(i, i);
        set i = i + 1;
    end while;
end
//
delimiter ;

-- Seed keys 1..100000 through proc1, then grow tab1 to 4,000,000 consecutive
-- keys by re-inserting the existing rows shifted by a fixed offset.
call proc1();

select count(*) from tab1;

-- 100,000 -> 200,000 rows
insert into tab1(col1, col2)
select col1 + 100000, col2 + 100000
from tab1;

-- 200,000 -> 400,000 rows
insert into tab1(col1, col2)
select col1 + 200000, col2 + 200000
from tab1;

-- 400,000 -> 800,000 rows
insert into tab1(col1, col2)
select col1 + 400000, col2 + 400000
from tab1;

-- 800,000 -> 1,000,000 rows: only the top 200,000 keys are copied
insert into tab1(col1, col2)
select col1 + 200000, col2 + 200000
from tab1
where col1 between 600001 and 800000;

-- 1,000,000 -> 2,000,000 rows
insert into tab1(col1, col2)
select col1 + 1000000, col2 + 1000000
from tab1;

-- 2,000,000 -> 4,000,000 rows
insert into tab1(col1, col2)
select col1 + 2000000, col2 + 2000000
from tab1;

-- Pattern 2 tables: four regular tables with the same shape as tab1.
-- "if exists" keeps each drop from raising an error on the first run.
drop table if exists tab21;
create table tab21(col1 int, col2 int);
alter table tab21 add constraint tab21pk primary key(col1);

drop table if exists tab22;
create table tab22(col1 int, col2 int);
alter table tab22 add constraint tab22pk primary key(col1);

drop table if exists tab23;
create table tab23(col1 int, col2 int);
alter table tab23 add constraint tab23pk primary key(col1);

drop table if exists tab24;
create table tab24(col1 int, col2 int);
alter table tab24 add constraint tab24pk primary key(col1);

-- Split tab1 into four slices of 1,000,000 keys each, one slice per table.
insert into tab21(col1, col2)
select col1, col2
from tab1
where col1 between 1 and 1000000;

insert into tab22(col1, col2)
select col1, col2
from tab1
where col1 between 1000001 and 2000000;

insert into tab23(col1, col2)
select col1, col2
from tab1
where col1 between 2000001 and 3000000;

insert into tab24(col1, col2)
select col1, col2
from tab1
where col1 between 3000001 and 4000000;

-- Row count of each slice.
select count(*) from tab21;
select count(*) from tab22;
select count(*) from tab23;
select count(*) from tab24;

-- Pattern 3 table: one table hash-partitioned on col1 into four partitions.
-- "if exists" keeps the drop from raising an error on the first run.
-- NOTE(review): unlike the Oracle and PostgreSQL versions of tab30, no primary
-- key is added here; left as-is so the measured setup is unchanged - confirm
-- whether that difference is intentional.
drop table if exists tab30;
create table tab30 (col1 int, col2 int)
partition by hash(col1)
(partition tab31, partition tab32, partition tab33, partition tab34);

-- Copy every row of tab1 into the partitioned table.
insert into tab30(col1, col2)
select col1, col2
from tab1;

-- Row count of each partition.
select count(*) from tab30 partition(tab31);
select count(*) from tab30 partition(tab32);
select count(*) from tab30 partition(tab33);
select count(*) from tab30 partition(tab34);

-- 測定
# Pattern 1: one regular table, timed at parallelism 1, 4 and 8.
for degree in 1 4 8; do
    time mysqlpump -u root -p --default-parallelism=${degree} test tab1 > tab1.dmp
done

# Pattern 2: four regular tables dumped together.
for degree in 1 4 8; do
    time mysqlpump -u root -p --default-parallelism=${degree} test tab21 tab22 tab23 tab24 > tab2.dmp
done

# Pattern 3: one partitioned table.
for degree in 1 4 8; do
    time mysqlpump -u root -p --default-parallelism=${degree} test tab30 > tab3.dmp
done

(パターン1)通常テーブルテーブル数=1 全データ件数=400万件
パラレル度=1 0m4.547s
パラレル度=4 0m5.252s
パラレル度=8 0m5.490s

(パターン2)通常テーブルテーブル数=4 全データ件数=400万件
パラレル度=1 0m5.135s
パラレル度=4 0m3.713s
パラレル度=8 0m3.530s

(パターン3)パーティションテーブルテーブル数=1 パーティション数=4 全データ件数=400万件
パラレル度=1 0m5.259s
パラレル度=4 0m5.763s
パラレル度=8 0m5.093s

-- 結論
複数テーブル同時にダンプする場合であれば、パラレル実行で少しは速くなる傾向あり
※1個のパーティションテーブルの場合は効果はない模様

Oracle

(19c)

CPUコア数=4

パラレル度=1,4,8

全部で9パターン

-- テストデータ作成
-- Pattern 1 table: a single regular table keyed on col1.
-- The primary key is declared inline instead of through a separate alter.
drop table tab1 purge;
create table tab1 (
    col1 int,
    col2 int,
    constraint tab1pk primary key (col1)
);

-- Load tab1 with rows (i, i) for i = 1..4000000.
-- The commit is issued once after the loop instead of once per row: the
-- resulting table contents are the same, without 4,000,000 separate commits.
begin
    for i in 1..4000000 loop
        insert into tab1(col1, col2) values(i, i);
    end loop;
    commit;
end;
/

select count(*) from tab1;

-- Pattern 2 tables: four regular tables with the same shape as tab1.
-- Each primary key is declared inline instead of through a separate alter.
drop table tab21 purge;
create table tab21 (
    col1 int,
    col2 int,
    constraint tab21pk primary key (col1)
);

drop table tab22 purge;
create table tab22 (
    col1 int,
    col2 int,
    constraint tab22pk primary key (col1)
);

drop table tab23 purge;
create table tab23 (
    col1 int,
    col2 int,
    constraint tab23pk primary key (col1)
);

drop table tab24 purge;
create table tab24 (
    col1 int,
    col2 int,
    constraint tab24pk primary key (col1)
);

-- Split tab1 into four slices of 1,000,000 keys each, committing after each slice.
insert into tab21(col1, col2)
select col1, col2
from tab1
where col1 between 1 and 1000000;
commit;

insert into tab22(col1, col2)
select col1, col2
from tab1
where col1 between 1000001 and 2000000;
commit;

insert into tab23(col1, col2)
select col1, col2
from tab1
where col1 between 2000001 and 3000000;
commit;

insert into tab24(col1, col2)
select col1, col2
from tab1
where col1 between 3000001 and 4000000;
commit;

-- Row count of each slice.
select count(*) from tab21;
select count(*) from tab22;
select count(*) from tab23;
select count(*) from tab24;

-- Pattern 3 table: one table hash-partitioned on col1 into four partitions,
-- with a primary key on col1 added after creation.
drop table tab30 purge;
create table tab30 (
    col1 int,
    col2 int
)
partition by hash (col1)
(
    partition tab31,
    partition tab32,
    partition tab33,
    partition tab34
);
alter table tab30 add constraint tab30pk primary key (col1);

-- Copy every row of tab1 into the partitioned table.
insert into tab30(col1, col2)
select col1, col2
from tab1;
commit;

-- Row count of each partition.
select count(*) from tab30 partition (tab31);
select count(*) from tab30 partition (tab32);
select count(*) from tab30 partition (tab33);
select count(*) from tab30 partition (tab34);

-- 測定

# Pattern 1: one regular table, timed at parallel 1, 4 and 8.
for degree in 1 4 8; do
    time expdp test/test@pdb1 directory=ORA_DIR dumpfile=tab1_%U.dmp logfile=tab1.log tables=tab1 reuse_dumpfiles=yes parallel=${degree}
done

# Pattern 2: four regular tables exported together.
for degree in 1 4 8; do
    time expdp test/test@pdb1 directory=ORA_DIR dumpfile=tab2_%U.dmp logfile=tab2.log tables=tab21,tab22,tab23,tab24 reuse_dumpfiles=yes parallel=${degree}
done

# Pattern 3: one partitioned table.
for degree in 1 4 8; do
    time expdp test/test@pdb1 directory=ORA_DIR dumpfile=tab3_%U.dmp logfile=tab3.log tables=tab30 reuse_dumpfiles=yes parallel=${degree}
done

(パターン1)通常テーブルテーブル数=1 全データ件数=400万件
パラレル度=1 0m42.289s
パラレル度=4 0m46.528s
パラレル度=8 0m41.523s

(パターン2)通常テーブルテーブル数=4 全データ件数=400万件
パラレル度=1 0m41.273s
パラレル度=4 0m33.438s
パラレル度=8 0m36.295s

(パターン3)パーティションテーブルテーブル数=1 パーティション数=4 全データ件数=400万件
パラレル度=1 0m35.218s
パラレル度=4 0m29.283s
パラレル度=8 0m33.453s

-- 結論
複数テーブル同時にダンプする場合や1テーブルであってもパーティションテーブルであれば、パラレル実行で少しは速くなる傾向あり

PostgreSQL

(13)

CPUコア数=4

パラレル度=1,4,8

全部で9パターン

-- テストデータ作成
-- Pattern 1 table: a single regular table keyed on col1.
-- "if exists" keeps the drop from raising an error when the script is run
-- against a database where tab1 has not been created yet.
drop table if exists tab1;
create table tab1(col1 int, col2 int);
alter table tab1 add constraint tab1pk primary key(col1);

-- Load tab1 with rows (n, n) for n = 1..4000000.
-- A single set-based insert over generate_series replaces the row-by-row
-- PL/pgSQL loop; the resulting table contents are the same.
insert into tab1(col1, col2)
select n, n
from generate_series(1, 4000000) as g(n);

select count(*) from tab1;

-- Pattern 2 tables: four regular tables with the same shape as tab1.
-- "if exists" keeps each drop from raising an error on the first run.
drop table if exists tab21;
create table tab21(col1 int, col2 int);
alter table tab21 add constraint tab21pk primary key(col1);

drop table if exists tab22;
create table tab22(col1 int, col2 int);
alter table tab22 add constraint tab22pk primary key(col1);

drop table if exists tab23;
create table tab23(col1 int, col2 int);
alter table tab23 add constraint tab23pk primary key(col1);

drop table if exists tab24;
create table tab24(col1 int, col2 int);
alter table tab24 add constraint tab24pk primary key(col1);

-- Populate each table with its 1,000,000-key slice of tab1 (the same split
-- used in the MySQL and Oracle sections). Without this step the tables stay
-- empty and the counts below return 0.
insert into tab21(col1, col2)
select col1, col2 from tab1 where col1 between 1 and 1000000;

insert into tab22(col1, col2)
select col1, col2 from tab1 where col1 between 1000001 and 2000000;

insert into tab23(col1, col2)
select col1, col2 from tab1 where col1 between 2000001 and 3000000;

insert into tab24(col1, col2)
select col1, col2 from tab1 where col1 between 3000001 and 4000000;

-- Row count of each slice.
select count(*) from tab21;
select count(*) from tab22;
select count(*) from tab23;
select count(*) from tab24;

-- Pattern 3 table: one table hash-partitioned on col1, with a primary key on col1.
-- "if exists" keeps the drop from raising an error on the first run.
drop table if exists tab30;
create table tab30(col1 int, col2 int)
partition by hash(col1);
alter table tab30 add constraint tab30pk primary key(col1);

-- Four hash partitions, one per remainder of modulus 4.
create table tab31 partition of tab30 for values with (modulus 4, remainder 0);
create table tab32 partition of tab30 for values with (modulus 4, remainder 1);
create table tab33 partition of tab30 for values with (modulus 4, remainder 2);
create table tab34 partition of tab30 for values with (modulus 4, remainder 3);

-- Copy every row of tab1 into the partitioned table.
insert into tab30(col1, col2)
select col1, col2
from tab1;

-- Row count of each partition.
select count(*) from tab31;
select count(*) from tab32;
select count(*) from tab33;
select count(*) from tab34;

-- 測定
# Directory-format dumps (-F d) need a target directory that does not already
# exist, so the previous run's output is removed before every timed run rather
# than only once up front.

# Pattern 1: one regular table, timed at 1, 4 and 8 jobs.
for jobs in 1 4 8; do
    rm -rf /var/lib/pgsql/tab1
    time pg_dump -U postgres -f /var/lib/pgsql/tab1 -F d -j ${jobs} -n public -t tab1 test
done

# Pattern 2: the four regular tables matched by 'tab2?'.
for jobs in 1 4 8; do
    rm -rf /var/lib/pgsql/tab2
    time pg_dump -U postgres -f /var/lib/pgsql/tab2 -F d -j ${jobs} -n public -t 'tab2?' test
done

# Pattern 3: the partitioned table and its partitions, matched by 'tab3?'.
for jobs in 1 4 8; do
    rm -rf /var/lib/pgsql/tab3
    time pg_dump -U postgres -f /var/lib/pgsql/tab3 -F d -j ${jobs} -n public -t 'tab3?' test
done

(パターン1)通常テーブルテーブル数=1 全データ件数=400万件
パラレル度=1 0m2.255s
パラレル度=4 0m2.276s
パラレル度=8 0m2.303s

(パターン2)通常テーブルテーブル数=4 全データ件数=400万件
パラレル度=1 0m2.257s
パラレル度=4 0m0.891s
パラレル度=8 0m0.891s

(パターン3)パーティションテーブルテーブル数=1 パーティション数=4 全データ件数=400万件
パラレル度=1 0m5.462s
パラレル度=4 0m1.832s
パラレル度=8 0m2.038s

-- 結論
複数テーブル同時にダンプする場合やパーティションテーブルであれば、パラレル実行で速くなる傾向あり

SQL Server

調べた限り、bcp ユーティリティはパラレルオプションなし