セミジョイン/アンチジョイン

作業概要：
「in」「exists」「not in」「not exists」それぞれについて
結合キーカラムが nullable の場合とnot nullの場合の実行計画を確認した

結果：
「in」「exists」「not exists」については、結合キーカラムのnull属性に関係なくサブクエリのテーブルでインデックスが使用された。

「not in」については、結合キーカラムのnull属性に依存して実行計画が変化した。

結合キーカラムが nullable の場合、サブクエリのテーブルでフルスキャンとなった。(MySQL、Oracle、PostgreSQL、SQL Server)
結合キーカラムが not null の場合、サブクエリのテーブルでインデックスが使用された。(MySQL、Oracle、SQL Server)
結合キーカラムが not null の場合、サブクエリのテーブルでフルスキャンのままだった。(PostgreSQL)

MySQL

(8.0.29)

drop table tab1 ;
drop table tab2 ;
create table tab1(col1 int ,col2 int);
create table tab2(col1 int ,col2 int);

drop procedure proc1;

delimiter //
create procedure proc1(in x bigint)
begin
declare i bigint;
set i = 0;
start transaction;
while i < x do
set i = i + 1;
insert into tab1(col1,col2) select i, ceil(rand()*100000);
insert into tab2(col1,col2) select i, ceil(rand()*100000);
end while;
commit;
end
//
delimiter ;

call proc1(10000);

select count(*) from tab1;
select count(*) from tab2;
select * from tab1 limit 10;
select * from tab2 limit 10;

create index ind1 on tab1(col1);
create index ind2 on tab2(col1);

analyze table tab1;
analyze table tab2;

explain analyze
select count(*)
from tab1 t1
where t1.col1 in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;

→ Nested loop semijoin/Table scan on t1/Index lookup on tab2 using ind2

explain analyze
select count(*)
from tab1 t1
where exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;

→ Nested loop semijoin/Table scan on t1/Index lookup on tab2 using ind2

explain analyze
select count(*)
from tab1 t1
where t1.col1 not in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;

→ Filter/Table scan on t1/Table scan on tab2 ★

※結合キーのカラム(col1)がnot nullの場合、他と同様にインデックスが使用される
→ Nested loop antijoin/Table scan on t1/Index lookup on tab2 using ind2

explain analyze
select count(*)
from tab1 t1
where not exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;

→ Nested loop antijoin/Table scan on t1/Index lookup on tab2 using ind2

Oracle

(19c)

drop table tab1 purge;
drop table tab2 purge;
create table tab1(col1 int ,col2 int);
create table tab2(col1 int ,col2 int);

declare
begin
for i in 1..10000 loop
insert into tab1(col1,col2) values(i, ceil(dbms_random.value()*100000) );
insert into tab2(col1,col2) values(i, ceil(dbms_random.value()*100000) );

end loop;
end;
/

commit;

select count(*) from tab1;
select count(*) from tab2;
select * from tab1 fetch first 10 rows only;
select * from tab2 fetch first 10 rows only;

create index ind1 on tab1(col1);
create index ind2 on tab2(col1);

exec dbms_stats.gather_table_stats(user, 'TAB1');
exec dbms_stats.gather_table_stats(user, 'TAB2');

explain plan for
select count(*)
from tab1 t1
where t1.col1 in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;
select * from table(dbms_xplan.display(format => 'ALL') );

→ NESTED LOOPS SEMI/TABLE ACCESS FULL/INDEX RANGE SCAN

explain plan for
select count(*)
from tab1 t1
where exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;
select * from table(dbms_xplan.display(format => 'ALL') );

→ NESTED LOOPS SEMI/TABLE ACCESS FULL/INDEX RANGE SCAN

explain plan for
select count(*)
from tab1 t1
where t1.col1 not in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;
select * from table(dbms_xplan.display(format => 'ALL') );

→ HASH JOIN ANTI NA/TABLE ACCESS FULL/TABLE ACCESS FULL ★

※結合キーのカラム(col1)がnot nullの場合、他と同様にインデックスが使用される
→ NESTED LOOPS SEMI/TABLE ACCESS FULL/INDEX RANGE SCAN

explain plan for
select count(*)
from tab1 t1
where not exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;
select * from table(dbms_xplan.display(format => 'ALL') );

→ NESTED LOOPS ANTI/TABLE ACCESS FULL/INDEX RANGE SCAN

PostgreSQL

(14)

drop table tab1 ;
drop table tab2 ;
create table tab1(col1 int ,col2 int);
create table tab2(col1 int ,col2 int);

insert into tab1(col1,col2) select g, ceil(random()*100000) from generate_series(1,10000) g;
insert into tab2(col1,col2) select g, ceil(random()*100000) from generate_series(1,10000) g;

select count(*) from tab1;
select count(*) from tab2;
select * from tab1 limit 10;
select * from tab2 limit 10;

create index ind1 on tab1(col1);
create index ind2 on tab2(col1);

analyze tab1;
analyze tab2;

explain analyze
select count(*)
from tab1 t1
where t1.col1 in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;

→ Nested loop semijoin/Seq Scan on tab1 t1/Index Scan using ind2 on tab2

explain analyze
select count(*)
from tab1 t1
where exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;

→ Nested loop semijoin/Seq Scan on tab1 t1/Index Scan using ind2 on tab2

explain analyze
select count(*)
from tab1 t1
where t1.col1 not in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;

→ Seq Scan on tab1 t1/SubPlan 1/Seq Scan on tab2 ★

※結合キーのカラム(col1)がnot nullの場合でもインデックスは使用されない ★★

explain analyze
select count(*)
from tab1 t1
where not exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;

→ Nested Loop Anti Join/Seq Scan on tab1 t1/Index Scan using ind2 on tab2

SQL Server

(2019)

drop table tab1 ;
drop table tab2 ;
create table tab1(col1 int ,col2 int);
create table tab2(col1 int ,col2 int);

set nocount on
declare @i bigint;
set @i = 1;
begin transaction;
while @i <= 10000
begin
insert into tab1(col1,col2) select @i, ceiling(rand()*100000);
insert into tab2(col1,col2) select @i, ceiling(rand()*100000);
set @i = @i + 1;
end
commit;

select count(*) from tab1;
select count(*) from tab2;
select top(10) * from tab1 ;
select top(10) * from tab2 ;

create index ind1 on tab1(col1);
create index ind2 on tab2(col1);

update statistics tab1;
update statistics tab2;

set showplan_all on
go

select count(*)
from tab1 t1
where t1.col1 in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;

→ Nested Loops(Left Semi Join)/Table Scan/Index Seek

select count(*)
from tab1 t1
where exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;

→ Nested Loops(Left Semi Join)/Table Scan/Index Seek

select count(*)
from tab1 t1
where t1.col1 not in ( select col1 from tab2 where col2 < 100 )
and t1.col2 < 10;

→ Nested Loops(Left Anti Join)/Table Scan/Table Scan ★

※結合キーのカラム(col1)がnot nullの場合、他と同様にインデックスが使用される
→ Nested Loops(Left Anti Join)/Table Scan/Index Seek

select count(*)
from tab1 t1
where not exists ( select 1 from tab2 where col2 < 100 and col1 = t1.col1 )
and t1.col2 < 10;

→ Nested Loops(Left Anti Join)/Table Scan/Index Seek