diff --git a/.github/workflows/ci_linux/deps_apt.sh b/.github/workflows/ci_linux/deps_apt.sh
index fdc19d6b53..b4fbc23a73 100755
--- a/.github/workflows/ci_linux/deps_apt.sh
+++ b/.github/workflows/ci_linux/deps_apt.sh
@@ -1,6 +1,9 @@
 #!/usr/bin/env bash
 sudo apt-get -qq install \
     gcc \
+    g++ \
+    gfortran \
+    liblapack-dev \
     libblas-dev \
     cmake \
     curl
diff --git a/.github/workflows/ci_linux/python_deps.sh b/.github/workflows/ci_linux/python_deps.sh
index 1f5e400f2d..bc4dd8b246 100755
--- a/.github/workflows/ci_linux/python_deps.sh
+++ b/.github/workflows/ci_linux/python_deps.sh
@@ -1,17 +1,16 @@
 #!/usr/bin/env bash
 python -m pip install --upgrade pip
 pip install "numpy!=1.16,!=1.17" "scipy!=1.5" h5py pytest pytest-cov pytest-timer
-pip install pyberny
+pip install git+https://github.com/jhrmnn/pyberny.git@36a4be9
 pip install --no-deps pyscf-dispersion==1.3.0
+pip install geometric
 
 version=$(python -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))')
-if [ $version != '3.12' ]; then
-    pip install geometric
-    pip install spglib
-fi
 
-if [ $version != '3.8' ]; then
+if [ $version == '3.12' ]; then
+    pip install spglib
     pip install pytblis
+    pip install git+https://github.com/sunqm/zquatev
 fi
 
 #cppe
diff --git a/.github/workflows/run_tests.sh b/.github/workflows/run_tests.sh
index 886e8c78c8..b4a6396778 100755
--- a/.github/workflows/run_tests.sh
+++ b/.github/workflows/run_tests.sh
@@ -13,6 +13,7 @@ version=$(python -c 'import sys; print("{0}.{1}".format(*sys.version_info[:2]))'
 # pytest-cov on Python 3.12 consumes huge memory
 if [ "$RUNNER_OS" == "Linux" ] && [ $version != "3.12" ]; then
   pytest pyscf/ -s -c pytest.ini \
+    --durations=20 \
     --cov-report xml --cov-report term --cov-config .coveragerc --cov pyscf
 else
   pytest pyscf/ -s -c pytest.ini pyscf
diff --git a/CHANGELOG b/CHANGELOG
index ddd2c09500..03bcbd0f87 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,12 @@
+PySCF 2.13.1 (2026-06-01)
+-------------------------
+* Fixes
+  - Missing CP2K basis set data in wheel distributions
+  - Small-rotor error.
+  - Corrected SG1 grid radii handling for ghost atoms.
+  - Fixed ECP loading to correctly fall back to Basis Set Exchange when local data is unavailable.
+
+
 PySCF 2.13.0 (2026-04-20)
 -------------------------
 * Added
diff --git a/MANIFEST.in b/MANIFEST.in
index 469d803cf3..e62e98ac34 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -11,12 +11,16 @@ include pyscf/lib/deps/lib*/libcint.[4-9].dylib
 include pyscf/lib/deps/lib*/libxc.*.dylib
 include pyscf/lib/deps/lib*/libxcfun.[2-9].dylib
 
+# windows dynamic libraries
+include pyscf/lib/*.dll
+include pyscf/lib/deps/bin/*.dll
+
 include pyscf/geomopt/log.ini
 include pyscf/gto/basis/bse_meta.json
 
 # CP2K basis set
-include pyscf/lib/pbc/gto/basis/*BASIS*
-include pyscf/lib/pbc/gto/pseudo/*POTENTIAL*
+include pyscf/pbc/gto/basis/*BASIS*
+include pyscf/pbc/gto/pseudo/*POTENTIAL*
 
 # source code
 recursive-include pyscf/lib *.c *.h CMakeLists.txt
diff --git a/README.md b/README.md
index 82544d08cb..9632434deb 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,9 @@ Python-based Simulations of Chemistry Framework
 [![Build Status](https://github.com/pyscf/pyscf/workflows/CI/badge.svg)](https://github.com/pyscf/pyscf/actions?query=workflow%3ACI)
 [![codecov](https://codecov.io/gh/pyscf/pyscf/branch/master/graph/badge.svg)](https://codecov.io/gh/pyscf/pyscf)
 
-2026-04-20
+2026-06-01
 
-* [Stable release 2.13.0](https://github.com/pyscf/pyscf/releases/tag/v2.13.0)
+* [Stable release 2.13.1](https://github.com/pyscf/pyscf/releases/tag/v2.13.1)
 * [Changelog](../master/CHANGELOG)
 * [Documentation](http://www.pyscf.org)
 * [Installation](#installation)
diff --git a/examples/2-benchmark/benchmarking_utils.py b/examples/2-benchmark/benchmarking_utils.py
index 78e3c37eaf..ddefe615a3 100644
--- a/examples/2-benchmark/benchmarking_utils.py
+++ b/examples/2-benchmark/benchmarking_utils.py
@@ -4,13 +4,24 @@
 
 def setup_logger():
     log = pyscf.lib.logger.Logger(verbose=5)
-    with open('/proc/cpuinfo') as f:
-        for line in f:
-            if 'model name' in line:
-                log.note(line[:-1])
-                break
-    with open('/proc/meminfo') as f:
-        log.note(f.readline()[:-1])
+    try:
+        with open('/proc/cpuinfo') as f:
+            for line in f:
+                if 'model name' in line:
+                    log.note(line[:-1])
+                    break
+    except FileNotFoundError:
+        pass
+    try:
+        with open('/proc/meminfo') as f:
+            log.note(f.readline()[:-1])
+    except FileNotFoundError:
+        try:
+            import psutil
+            mem = psutil.virtual_memory()
+            log.note(f'MemTotal: {mem.total // 1024} kB')
+        except ImportError:
+            pass
     log.note('OMP_NUM_THREADS=%s\n', os.environ.get('OMP_NUM_THREADS', None))
     return log
 
diff --git a/examples/ao2mo/01-outcore.py b/examples/ao2mo/01-outcore.py
index a175441e84..70fd796d31 100644
--- a/examples/ao2mo/01-outcore.py
+++ b/examples/ao2mo/01-outcore.py
@@ -3,9 +3,9 @@
 # Author: Qiming Sun <osirpt.sun@gmail.com>
 #
 
-import tempfile
 import h5py
 from pyscf import gto, scf, ao2mo
+from pyscf import lib
 
 '''
 Save the transformed integrals in the given file in HDF5 format
@@ -22,7 +22,7 @@
 myhf.kernel()
 
 orb = myhf.mo_coeff
-ftmp = tempfile.NamedTemporaryFile()
+ftmp = lib.NamedTemporaryFile()
 print('MO integrals are saved in file  %s  under dataset "eri_mo"' % ftmp.name)
 ao2mo.kernel(mol, orb, ftmp.name)
 
diff --git a/examples/ao2mo/10-diff_orbs_for_ijkl.py b/examples/ao2mo/10-diff_orbs_for_ijkl.py
index e2ac7240ca..6af4a3fb7a 100644
--- a/examples/ao2mo/10-diff_orbs_for_ijkl.py
+++ b/examples/ao2mo/10-diff_orbs_for_ijkl.py
@@ -3,10 +3,10 @@
 # Author: Qiming Sun <osirpt.sun@gmail.com>
 #
 
-import tempfile
 import numpy
 import h5py
 from pyscf import gto, scf, ao2mo
+from pyscf import lib
 
 '''
 Integral transformation for four different orbitals
@@ -39,7 +39,7 @@
 #
 # Given four MOs, compute the MO-integrals and saved in dataset "mp2_bz"
 #
-eritmp = tempfile.NamedTemporaryFile()
+eritmp = lib.NamedTemporaryFile()
 nocc = mol.nelectron // 2
 nvir = len(mf.mo_energy) - nocc
 co = mf.mo_coeff[:,:nocc]
diff --git a/examples/ao2mo/11-ump2.py b/examples/ao2mo/11-ump2.py
index cbb66405d9..b1642b2156 100644
--- a/examples/ao2mo/11-ump2.py
+++ b/examples/ao2mo/11-ump2.py
@@ -3,7 +3,6 @@
 # Author: Qiming Sun <osirpt.sun@gmail.com>
 #
 
-import tempfile
 import numpy
 import h5py
 from pyscf import gto, scf, ao2mo
diff --git a/examples/ao2mo/20-eri_grad_hess.py b/examples/ao2mo/20-eri_grad_hess.py
index aefd32f851..22e739280c 100644
--- a/examples/ao2mo/20-eri_grad_hess.py
+++ b/examples/ao2mo/20-eri_grad_hess.py
@@ -3,10 +3,10 @@
 # Author: Qiming Sun <osirpt.sun@gmail.com>
 #
 
-import tempfile
 import numpy
 import h5py
 from pyscf import gto, scf, ao2mo
+from pyscf import lib
 
 '''
 Integral transformation for irregular operators
@@ -28,7 +28,7 @@
 #
 # Given four MOs, compute the MO-integral gradients
 #
-gradtmp = tempfile.NamedTemporaryFile()
+gradtmp = lib.NamedTemporaryFile()
 nocc = mol.nelectron // 2
 nvir = len(mf.mo_energy) - nocc
 co = mf.mo_coeff[:,:nocc]
@@ -56,7 +56,7 @@
 #       9       d/dZ  d/dZ
 #
 orb = mf.mo_coeff
-hesstmp = tempfile.NamedTemporaryFile()
+hesstmp = lib.NamedTemporaryFile()
 ao2mo.kernel(mol, orb, hesstmp.name, intor='cint2e_ipvip1_sph',
              dataname='hessints1', aosym='s4')
 with ao2mo.load(hesstmp, 'hessints1') as eri:
diff --git a/examples/ao2mo/22-rkb_no_pair_ints.py b/examples/ao2mo/22-rkb_no_pair_ints.py
index e0c0ee2216..319c676eb1 100644
--- a/examples/ao2mo/22-rkb_no_pair_ints.py
+++ b/examples/ao2mo/22-rkb_no_pair_ints.py
@@ -10,7 +10,6 @@
 from pyscf import scf
 from pyscf import lib
 from pyscf.ao2mo import r_outcore
-import tempfile
 import os
 
 mol = gto.M(
@@ -53,7 +52,7 @@ def no_pair_ovov(mol, mo_coeff, erifile):
 
         def run_and_add(mol, mos, erifile, dataname_main, intor):
             # Use a temporary file for the intermediate integrals
-            with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as tmpfile:
+            with lib.NamedTemporaryFile(suffix=".h5", delete=False) as tmpfile:
                 tmp_erifile = tmpfile.name
 
             try:
diff --git a/examples/cc/03-gccsd.py b/examples/cc/03-gccsd.py
new file mode 100644
index 0000000000..4f5f798e06
--- /dev/null
+++ b/examples/cc/03-gccsd.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+'''
+GCCSD: CCSD based on the GHF reference.
+
+The cluster amplitudes of GCCSD are represented in the spin-orbital basis and
+are solved without assuming spin symmetry. GCCSD can be applied to spin-orbit
+coupled systems.
+
+For non-relativistic calculations, GCCSD is typically equivalent to the
+corresponding UHF-CCSD calculation. When spin-orbit coupling (SOC) is included,
+for example, the X2C Hamiltonian (see examples/x2c/03-x2c_ghf.py) or SOC-ECP
+(see examples/scf/44-soc_ecp.py), the GHF orbitals become complex-valued, and
+the resulting GCCSD amplitude are also complex-valued.
+'''
+
+import pyscf
+
+mol = pyscf.M(atom='''
+O    0.   0.       0.
+H    0.   -0.757   0.587
+H    0.   0.757    0.587''',
+basis='cc-pvdz')
+#
+# Non-relativistic calculation. The CCSD object returned by mf.CCSD() is an
+# instance of the GCCSD class. The cluster amplitudes are represented in the
+# spin-orbital basis.
+#
+mf = mol.GHF().run()
+mycc = mf.CCSD().run()
+
+#
+# Enable SOC via the X2C Hamiltonian. GCCSD amplitudes are complex-valued.
+#
+mf = mol.GHF().x2c().run()
+mycc = mf.CCSD().run()
+
+#
+# For calculations using ECPs, SOC can be enabled with the setting
+#     mf.with_soc = True
+#
+# Running mf.CCSD() on such a reference will performs a GCCSD calculation
+# with complex-valued amplitudes.
+#
diff --git a/examples/cc/61-rccsdtq.py b/examples/cc/61-rccsdtq.py
index ce1aa962b8..4c385a1c7a 100644
--- a/examples/cc/61-rccsdtq.py
+++ b/examples/cc/61-rccsdtq.py
@@ -52,7 +52,7 @@
 mycc2.verbose = 5
 mycc2.incore_complete = True
 mycc2.kernel()
-print('Full-T4 RCCSDQ e_corr       % .12f    Ref % .12f    Diff % .12e' % (
+print('Full-T4 RCCSDTQ e_corr       % .12f    Ref % .12f    Diff % .12e' % (
         mycc2.e_corr, ref_e_corr, mycc2.e_corr - ref_e_corr))
 
 #
diff --git a/examples/cc/63-check_rccsdt_uccsdt_consistency.py b/examples/cc/63-check_rccsdt_uccsdt_consistency.py
index a04ab655e8..5af5e4f00a 100644
--- a/examples/cc/63-check_rccsdt_uccsdt_consistency.py
+++ b/examples/cc/63-check_rccsdt_uccsdt_consistency.py
@@ -63,7 +63,7 @@
 
 # Restart UCCSDT using amplitudes converted from RCCSDT
 tamps_init_uhf = [t1_rhf2uhf, t2_rhf2uhf, t3_rhf2uhf]
-myucc2 = cc.UCCSDT(mf, compact_tamps=False).set(conv_tol=1e-10, conv_tol_normt=1e-8, verbose=5)
+myucc2 = cc.UCCSDT(mf_uhf, compact_tamps=False).set(conv_tol=1e-10, conv_tol_normt=1e-8, verbose=5)
 myucc2.kernel(tamps=tamps_init_uhf)
 print('UCCSDT correlation energy % .12f    Ref % .12f    Diff % .12e' % (
         myucc2.e_corr, -0.2188784727114157, myucc2.e_corr - -0.2188784727114157))
diff --git a/examples/cc/64-chained_rccsd_rccsdt_rccsdtq.py b/examples/cc/64-chained_rccsd_rccsdt_rccsdtq.py
index 162be81458..3389325cbe 100644
--- a/examples/cc/64-chained_rccsd_rccsdt_rccsdtq.py
+++ b/examples/cc/64-chained_rccsd_rccsdt_rccsdtq.py
@@ -11,7 +11,6 @@
     - Examine the influence of DIIS acceleration on convergence.
 '''
 
-import numpy as np
 from pyscf import gto, scf, cc
 
 def run_rccsd_rccsdt_rccsdtq(do_diis=False, do_diis_max_t=False, verbose=0):
diff --git a/examples/cc/65-chained_uccsd_uccsdt.py b/examples/cc/65-chained_uccsd_uccsdt.py
index bbfc3a8777..15a5a782c0 100644
--- a/examples/cc/65-chained_uccsd_uccsdt.py
+++ b/examples/cc/65-chained_uccsd_uccsdt.py
@@ -11,7 +11,6 @@
     - Understand and handle the difference in T2 amplitude conventions between UCCSD and UCCSDT implementations.
 '''
 
-import numpy as np
 from pyscf import gto, scf, cc
 
 def run_uccsd_uccsdt(do_diis=False, do_diis_max_t=False, verbose=0):
@@ -32,7 +31,7 @@ def run_uccsd_uccsdt(do_diis=False, do_diis_max_t=False, verbose=0):
     myccsd.verbose = verbose
     myccsd.diis = do_diis
     myccsd.kernel()
-    print('RCCSD   e_corr % .12f    Ref % .12f    Diff % .12e' % (
+    print('UCCSD   e_corr % .12f    Ref % .12f    Diff % .12e' % (
             myccsd.e_corr, ref_ccsd_e_corr, myccsd.e_corr - ref_ccsd_e_corr))
 
     # UCCSDT
@@ -99,7 +98,7 @@ def run_uccsd_uccsdt(do_diis=False, do_diis_max_t=False, verbose=0):
     do_diis_max_t = False
     run_uccsd_uccsdt(do_diis=do_diis, do_diis_max_t=do_diis_max_t)
 
-    print('=== UCCSD / UCCSDT with DIIS (including T3 amplitudes) ===')
+    print('=== UCCSD -> UCCSDT with DIIS (including T3 amplitudes) ===')
     do_diis = True
     do_diis_max_t = True
     run_uccsd_uccsdt(do_diis=do_diis, do_diis_max_t=do_diis_max_t)
diff --git a/examples/cc/66-rccsdt_q.py b/examples/cc/66-rccsdt_q.py
new file mode 100644
index 0000000000..28e51177fa
--- /dev/null
+++ b/examples/cc/66-rccsdt_q.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+#
+# Author: Yu Jin <yjin@flatironinstitute.org>
+#
+
+'''
+Examples of RCCSDT(Q) calculations.
+
+This script demonstrates:
+    - Consistency of the [Q] and (Q) energy corrections on top of RCCSDT between calculations
+        using full and compact T3 storage.
+'''
+
+from pyscf import gto, scf, cc
+
+mol = gto.M(atom='H 0 0 0; F 0 0 1.1', basis='ccpvdz')
+mf = scf.RHF(mol)
+mf.conv_tol = 1e-14
+mf.kernel()
+
+# Reference CCSDT correlation energy, and [Q] and (Q) energy correction
+ref_e_corr = -0.2188784733230733
+ref_e_q_bracket = -0.0005026220700017348
+ref_e_q_paren = -0.0005490746450078632
+
+mycc1 = cc.RCCSDT(mf, compact_tamps=True)
+mycc1.conv_tol = 1e-10
+mycc1.conv_tol_normt = 1e-8
+mycc1.verbose = 5
+# einsum_backend: numpy (default) | pyscf | pytblis (recommended)
+# pytblis can be installed via `pip install pytblis==0.05` (See https://github.com/chillenb/pytblis)
+mycc1.set_einsum_backend('pyscf')
+mycc1.incore_complete = True
+mycc1.kernel()
+e_q_bracket, e_q_paren = mycc1.ccsdt_q()
+print('Triangular RCCSDT e_corr % .12f    Ref % .12f    Diff % .12e' % (
+        mycc1.e_corr, ref_e_corr, mycc1.e_corr - ref_e_corr))
+print('Triangular RCCSDT [Q]    % .12f    Ref % .12f    Diff % .12e' % (
+        e_q_bracket, ref_e_q_bracket, e_q_bracket - ref_e_q_bracket))
+print('Triangular RCCSDT (Q)    % .12f    Ref % .12f    Diff % .12e' % (
+        e_q_paren, ref_e_q_paren, e_q_paren - ref_e_q_paren))
+
+#
+# RCCSDT with full T3 storage
+# Same as cc.rccsdt_highm.RCCSDT
+#
+mycc2 = cc.RCCSDT(mf, compact_tamps=False)
+mycc2.conv_tol = 1e-10
+mycc2.conv_tol_normt = 1e-8
+mycc2.verbose = 5
+mycc2.incore_complete = True
+mycc2.kernel()
+q_bracket2, q_paren2 = mycc2.ccsdt_q()
+print('Full-T3 RCCSDT e_corr    % .12f    Ref % .12f    Diff % .12e' % (
+        mycc2.e_corr, ref_e_corr, mycc2.e_corr - ref_e_corr))
+print('Full-T3 RCCSDT [Q]       % .12f    Ref % .12f    Diff % .12e' % (
+        q_bracket2, ref_e_q_bracket, q_bracket2 - ref_e_q_bracket))
+print('Full-T3 RCCSDT (Q)       % .12f    Ref % .12f    Diff % .12e' % (
+        q_paren2, ref_e_q_paren, q_paren2 - ref_e_q_paren))
diff --git a/examples/df/01-auxbasis.py b/examples/df/01-auxbasis.py
index 523b9e3012..df01368000 100644
--- a/examples/df/01-auxbasis.py
+++ b/examples/df/01-auxbasis.py
@@ -10,7 +10,6 @@
 See also examples/gto/04-input_basis.py
 '''
 
-import tempfile
 from pyscf import gto, scf, df
 
 #
diff --git a/examples/df/40-precompute_df_integrals.py b/examples/df/40-precompute_df_integrals.py
index 38b36245a5..13cddcc35d 100644
--- a/examples/df/40-precompute_df_integrals.py
+++ b/examples/df/40-precompute_df_integrals.py
@@ -10,12 +10,12 @@
 reused many times.
 '''
 
-import tempfile
 from pyscf import gto, scf, df
+from pyscf import lib
 from pyscf.pbc import gto as pgto
 from pyscf.pbc import dft as pdft
 
-tmpf = tempfile.NamedTemporaryFile()
+tmpf = lib.NamedTemporaryFile()
 file_to_save_df_ints = tmpf.name
 print('DF integral is saved in %s' % file_to_save_df_ints)
 
diff --git a/examples/dft/33-custom_disp.py b/examples/dft/33-custom_disp.py
new file mode 100644
index 0000000000..f3d01b4cbe
--- /dev/null
+++ b/examples/dft/33-custom_disp.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+# Copyright 2021-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###########################################################
+#  Example of DFT with custom dispersion correction (dftd3/dftd4)
+###########################################################
+
+"""
+This example demonstrates the updated dispersion convention (mf.disp) in PySCF.
+
+To run the D3 and D4 examples, install the optional dispersion dependencies:
+
+    pip install 'pyscf-dispersion>1.5.0'
+
+Key knobs
+1) mf.xc
+   The XC functional for the underlying DFT calculation (e.g. 'b3lyp', 'wb97x-v').
+
+2) mf.disp
+   The dispersion correction to apply (e.g. D3BJ or D4).
+
+   Two common forms are supported:
+   a) Version only: 'd3bj', 'd3zero', 'd3bjm', 'd3zerom', 'd3op', 'd4'
+      The code will infer the dispersion parameter "method keyword" from mf.xc.
+
+   b) Explicit version:method: 'd4:wb97x' / 'd4:wb97x-rev' / 'd4:wb97x-3c'
+      - version: dispersion engine/version tag (d3bj, d3zero, d4, ...)
+      - method:  the keyword used by dftd3 (https://github.com/dftd3/simple-dftd3/blob/main/assets/parameters.toml) 
+                    or dftd4 (https://github.com/dftd4/dftd4/blob/main/assets/parameters.toml) to select parameters
+
+3) mf.nlc
+   Non-local correlation (e.g. VV10).
+   You do not need to set this if you would like to use *-V functional since
+   they will invoke VV10 by default. If you want the wB97X-V/wB97M-V XC form, but
+   without VV10, and with D3/D4 instead (e.g. wB97X-3c, wB97M-D4), explicitly disable VV10 via:
+       mf.nlc = 0
+
+Below we run six minimal single-point examples for H2O. Each block creates an
+SCF object, then sets mf.xc / mf.disp / mf.nlc explicitly.
+"""
+
+import pyscf
+from pyscf import dft
+
+atom = '''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+'''
+
+mol = pyscf.M(atom=atom, basis='def2-svp')
+
+print('Dispersion convention examples (tutorial)')
+print('------------------------------------------------')
+
+print()
+print('Example 1: B3LYP + D3BJ')
+mf = dft.RKS(mol)
+mf.xc = 'b3lyp'
+mf.disp = 'd3bj'
+mf.grids.level = 5
+mf.direct_scf_tol = 1e-14
+mf.conv_tol = 1e-12
+mf.max_cycle = 50
+e_tot = mf.kernel()
+print(f'  mf.xc   = {mf.xc}')
+print(f'  mf.disp = {mf.disp}')
+print(f'  e_tot   = {e_tot}')
+
+print()
+print("Example 2: B3LYP + D3BJ (explicit version:method)")
+print("  'd3bj:b3lyp' means: use D3BJ, and force the D3BJ parameters of method='b3lyp'")
+mf = dft.RKS(mol)
+mf.xc = 'b3lyp'
+mf.disp = 'd3bj:b3lyp'
+mf.grids.level = 5
+mf.direct_scf_tol = 1e-14
+mf.conv_tol = 1e-12
+mf.max_cycle = 50
+e_tot = mf.kernel()
+print(f'  mf.xc   = {mf.xc}')
+print(f'  mf.disp = {mf.disp}')
+print(f'  e_tot   = {e_tot}')
+
+print()
+print('Example 3: wB97X-V (VV10 nonlocal correlation)')
+print("  Here we demonstrate mf.nlc='vv10' (and no extra dispersion via mf.disp)")
+mf = dft.RKS(mol)
+mf.xc = 'wb97x-v'
+mf.nlc = 'vv10'
+mf.disp = None
+mf.grids.level = 5
+mf.direct_scf_tol = 1e-14
+mf.conv_tol = 1e-12
+mf.max_cycle = 50
+e_tot = mf.kernel()
+print(f'  mf.xc   = {mf.xc}')
+print(f'  mf.nlc  = {mf.nlc}')
+print(f'  mf.disp = {mf.disp}')
+print(f'  e_tot   = {e_tot}')
+
+print()
+print('Example 4: wB97X-D4 (explicit D4 parameters for method=wb97x, VV10 disabled)')
+print("  Key point: mf.xc='wb97x-v' + mf.nlc=0 + mf.disp='d4:wb97x'")
+mf = dft.RKS(mol)
+mf.xc = 'wb97x-v'
+mf.nlc = 0
+mf.disp = 'd4:wb97x'
+mf.grids.level = 5
+mf.direct_scf_tol = 1e-14
+mf.conv_tol = 1e-12
+mf.max_cycle = 50
+e_tot = mf.kernel()
+print(f'  mf.xc   = {mf.xc}')
+print(f'  mf.nlc  = {mf.nlc}')
+print(f'  mf.disp = {mf.disp}')
+print(f'  e_tot   = {e_tot}')
+
+print()
+print('Example 5: wB97X-D4rev (explicit D4 parameters for method=wb97x-rev, VV10 disabled)')
+print("  Key point: mf.xc='wb97x-v' + mf.nlc=0 + mf.disp='d4:wb97x-rev'")
+mf = dft.RKS(mol)
+mf.xc = 'wb97x-v'
+mf.nlc = 0
+mf.disp = 'd4:wb97x-rev'
+mf.grids.level = 5
+mf.direct_scf_tol = 1e-14
+mf.conv_tol = 1e-12
+mf.max_cycle = 50
+e_tot = mf.kernel()
+print(f'  mf.xc   = {mf.xc}')
+print(f'  mf.nlc  = {mf.nlc}')
+print(f'  mf.disp = {mf.disp}')
+print(f'  e_tot   = {e_tot}')
+
+print()
+print('Example 6: wB97X-3c (use wB97X-V form but disable VV10, then add D4 parameters for wb97x-3c)')
+print("  Key point: mf.xc='wb97x-v' + mf.nlc=0 + mf.disp='d4:wb97x-3c'")
+print("  basis = 'Grimme vDZP'")
+print("  ecp   = 'Grimme vDZP', please specify it for each element that needs ecp")
+print("  To load the Grimme vDZP basis/ECP, install basis-set-exchange:")
+print("      pip install basis-set-exchange")
+
+mol_3c = pyscf.M(
+    atom=atom,
+    basis='Grimme vDZP',
+    ecp={'O': 'Grimme vDZP'},  # H does not have ecp in Grimme vDZP.
+)
+mf = dft.RKS(mol_3c)
+mf.xc = 'wb97x-v'
+mf.nlc = 0
+mf.disp = 'd4:wb97x-3c'
+mf.grids.level = 5
+mf.direct_scf_tol = 1e-14
+mf.conv_tol = 1e-12
+mf.max_cycle = 50
+e_tot = mf.kernel()
+print(f'  mf.xc   = {mf.xc}')
+print(f'  mf.nlc  = {mf.nlc}')
+print(f'  mf.disp = {mf.disp}')
+print(f'  e_tot   = {e_tot}')
diff --git a/examples/gto/01-input_geometry.py b/examples/gto/01-input_geometry.py
index 96a4334601..6fba4e8dbc 100644
--- a/examples/gto/01-input_geometry.py
+++ b/examples/gto/01-input_geometry.py
@@ -14,6 +14,7 @@
 
 import numpy
 from pyscf import gto
+from pyscf import lib
 
 #
 # Input Cartesian coordinates
@@ -127,8 +128,7 @@
 # Read geometry from a file. If the file name is assigned to mol.atom, the
 # build method will guess the file format and parse the contents accordingly
 #
-import tempfile
-with tempfile.NamedTemporaryFile(mode='w', suffix='.xyz') as f:
+with lib.NamedTemporaryFile(mode='w', suffix='.xyz') as f:
     f.write('''3
 
 O 0 0 0
diff --git a/examples/gw/04-bse.py b/examples/gw/04-bse.py
new file mode 100644
index 0000000000..9da63f0c7e
--- /dev/null
+++ b/examples/gw/04-bse.py
@@ -0,0 +1,99 @@
+"""
+Example for Bethe-Salpeter equation.
+
+########
+Reference results for acetone / B3LYP / def2-SVP
+acetone geometry from: J. Phys. Chem. Lett. 2016, 7, 3, 586-591
+
+* GW step (fully analytic GW, quasiparticle equation solved iteratively)
+                HOMO (eV)   LUMO (eV)
+Turbomole        -8.78        2.75
+PySCF            -8.79        2.75
+
+* First three singlet excitations (eV) for BSE
+                S1      S2      S3
+Turbomole      3.41    7.35    8.56
+PySCF          3.41    7.36    8.58
+
+* First three triplet excitations (eV) for BSE
+                T1      T2      T3
+Turbomole      2.68    4.67    7.14
+PySCF          2.67    4.67    7.14
+
+"""
+import numpy as np
+from pyscf import gto, dft
+from pyscf.gw.gw_ac import GWAC
+from pyscf.gw.ugw_ac import UGWAC
+from pyscf.gw.bse import BSE, bse_lanczos, lanczos_estimate_spectrum
+
+# restricted
+mol = gto.Mole()
+mol.verbose = 5
+mol.atom = [[8, (0.0, 0.0, 0.0)], [1, (0.7571, 0.0, 0.5861)], [1, (-0.7571, 0.0, 0.5861)]]
+mol.basis = 'def2-svp'
+mol.build()
+mf = dft.RKS(mol, xc='pbe')
+mf.kernel()
+
+# GW-AC/BSE
+gw = GWAC(mf)
+gw.kernel()
+bse = BSE(gw)
+# Davidson algorithm for singlet excitation
+bse.TDA = False
+bse.kernel('s')
+bse.analyze()
+# Davidson algorithm for triplet excitation, turn on TDA
+bse.TDA = True
+bse.kernel('t')
+bse.analyze()
+# full diagonalization for triplet excitation
+bse.full_diagonalization('t')
+bse.analyze()
+
+eta = 0.01 # spectrum broadening in eV
+omega = np.linspace(0.0, 1.0, 1000)[:, None] + 1j * eta # (nω, 1)
+
+ao_dip = mol.intor('int1e_r', comp=3)
+nocc = mol.nelectron // 2
+mo_dip = np.einsum('xij,ia,jb->xab', ao_dip, mf.mo_coeff[:, :nocc], mf.mo_coeff[:, nocc:])
+
+bse.TDA = False
+lanczos_spectra = []
+for j in range(3):
+    alphas, betas = bse_lanczos(bse, multi='s', u1=mo_dip[j].flatten(), nsteps=500)
+    freqs, density = lanczos_estimate_spectrum(alphas, betas, (0, 1), eta, 1000)
+    lanczos_spectra.append(density)
+mean_spectrum = np.mean(lanczos_spectra, axis=0) * 4 * np.pi
+print("spectrum from Lanczos algorithm:")
+for i in range(len(freqs)):
+    print(f"{freqs[i]:.6f} {mean_spectrum[i]:.6f}")
+
+# Energy-specific BSE, target excitations above 0.4 AU
+gw = GWAC(mf)
+gw.kernel()
+bse = BSE(gw)
+bse.kernel('s', e_min=0.4)
+bse.analyze()
+bse.kernel('t', e_min=0.4)
+bse.analyze()
+
+# unrestricted
+mol = gto.Mole()
+mol.verbose = 5
+mol.atom = [[8, (0.0, 0.0, 0.0)], [1, (0.7571, 0.0, 0.5861)], [1, (-0.7571, 0.0, 0.5861)]]
+mol.charge = 1
+mol.spin = 1
+mol.basis = 'def2-svp'
+mol.build()
+mf = dft.UKS(mol, xc='pbe')
+mf.kernel()
+
+# UGWAC/BSE
+gw = UGWAC(mf)
+gw.kernel()
+
+bse = BSE(gw)
+bse.kernel('u')
+bse.analyze()
diff --git a/examples/mcscf/13-load_chkfile.py b/examples/mcscf/13-load_chkfile.py
index e25c68259b..096d9c2a81 100644
--- a/examples/mcscf/13-load_chkfile.py
+++ b/examples/mcscf/13-load_chkfile.py
@@ -3,7 +3,7 @@
 # Author: Qiming Sun <osirpt.sun@gmail.com>
 #
 
-import tempfile
+import os
 import h5py
 from pyscf import gto, scf, mcscf
 from pyscf import lib
@@ -14,7 +14,8 @@
 MCSCF objects.
 '''
 
-tmpchk = tempfile.NamedTemporaryFile()
+
+
 
 mol = gto.Mole()
 mol.atom = 'C 0 0 0; C 0 0 1.2'
@@ -22,11 +23,12 @@
 mol.build()
 
 mf = scf.RHF(mol)
-mf.chkfile = tmpchk.name
+chkname = os.path.join(lib.param.TMPDIR, '13-load_chkfile.chk')
+mf.chkfile = chkname
 mf.kernel()
 
 mc = mcscf.CASSCF(mf, 6, 6)
-mc.chkfile = tmpchk.name
+mc.chkfile = chkname
 mc.max_cycle_macro = 1
 mc.kernel()
 
@@ -35,7 +37,7 @@
 # Scenario 1: Using h5py to read quantities in chkfile
 #
 
-with h5py.File(tmpchk.name) as f:
+with h5py.File(chkname) as f:
     print('Keys in chkfile', f.keys)
     print('Keys in mcscf group', f['mcscf'].keys)
     mcscf_orb = f['mcscf/mo_coeff'].value
@@ -44,12 +46,12 @@
 #
 # Scenario 2: Using lib.chkfile module
 #
-mol = lib.chkfile.load_mol(tmpchk.name)
-mcscf_orb = lib.chkfile.load(tmpchk.name, 'mcscf/mo_coeff')
+mol = lib.chkfile.load_mol(chkname)
+mcscf_orb = lib.chkfile.load(chkname, 'mcscf/mo_coeff')
 
 #
 # Scenario 3: Using Python trick to quickly load scf/mcscf
 # intermediates/results
 #
 mc = mcscf.CASSCF(mf, 6, 6)
-mc.__dict__.update(lib.chkfile.load(tmpchk.name, 'mcscf'))
+mc.__dict__.update(lib.chkfile.load(chkname, 'mcscf'))
diff --git a/examples/mcscf/13-restart.py b/examples/mcscf/13-restart.py
index a6846d17fa..5aea906fc7 100644
--- a/examples/mcscf/13-restart.py
+++ b/examples/mcscf/13-restart.py
@@ -3,7 +3,7 @@
 # Author: Qiming Sun <osirpt.sun@gmail.com>
 #
 
-import tempfile
+import os
 from pyscf import gto, scf, mcscf
 from pyscf import lib
 
@@ -19,25 +19,25 @@
 intermediate results.
 '''
 
-tmpchk = tempfile.NamedTemporaryFile()
-
 mol = gto.Mole()
 mol.atom = 'C 0 0 0; C 0 0 1.2'
 mol.basis = 'ccpvdz'
 mol.build()
 
+chkname = os.path.join(lib.param.TMPDIR, '13-restart.chk')
 mf = scf.RHF(mol)
+mf.chkfile = chkname
 mf.kernel()
 
 mc = mcscf.CASSCF(mf, 6, 6)
-mc.chkfile = tmpchk.name
+mc.chkfile = chkname
 mc.max_cycle_macro = 1
 mc.kernel()
 
 #######################################################################
 #
 # Assuming the CASSCF was interrupted.  Intermediate data were saved in
-# tmpchk file.  Here we read the chkfile to restart the previous calculation.
+# chkname file.  Here we read the chkfile to restart the previous calculation.
 #
 #######################################################################
 mol = gto.Mole()
@@ -46,11 +46,11 @@
 mol.build()
 
 mc = mcscf.CASSCF(scf.RHF(mol), 6, 6)
-mo = lib.chkfile.load(tmpchk.name, 'mcscf/mo_coeff')
+mo = lib.chkfile.load(chkname, 'mcscf/mo_coeff')
 mc.kernel(mo)
 
 # Assuming you lose all memory about the previous calculation.
 # Restart the calculation with chkfile only.
-mol, mcdata = mcscf.chkfile.load_mcscf(tmpchk.name)
-mc = mcscf.CASSCF(mol, mcdata['ncas'], mcdata['nelecas']).update_from_chk(tmpchk.name)
+mol, mcdata = mcscf.chkfile.load_mcscf(chkname)
+mc = mcscf.CASSCF(mol, mcdata['ncas'], mcdata['nelecas']).update_from_chk(chkname)
 mc.kernel()
diff --git a/examples/mcscf/41-mcscf_custom_df_hamiltonian.py b/examples/mcscf/41-mcscf_custom_df_hamiltonian.py
index 8ef9465796..8ba2a00c4c 100644
--- a/examples/mcscf/41-mcscf_custom_df_hamiltonian.py
+++ b/examples/mcscf/41-mcscf_custom_df_hamiltonian.py
@@ -3,9 +3,9 @@
 # Author: Qiming Sun <osirpt.sun@gmail.com>
 #
 
-import tempfile
 import h5py
 from pyscf import gto, df, scf, mcscf
+from pyscf import lib
 
 '''
 Using the Cholesky decomposed 2-electron integrals to define the Hamiltonian in CASSCF
@@ -33,7 +33,7 @@
 #
 # Integrals on disk
 #
-ftmp = tempfile.NamedTemporaryFile()
+ftmp = lib.NamedTemporaryFile()
 df.outcore.cholesky_eri(mol, ftmp.name, auxbasis='ccpvdz-fit')
 
 with h5py.File(ftmp.name, 'r') as file1:
diff --git a/examples/mp/13-mp2_cabs.py b/examples/mp/13-mp2_cabs.py
new file mode 100644
index 0000000000..393158ed2a
--- /dev/null
+++ b/examples/mp/13-mp2_cabs.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+#
+# Author: Igor S. Gerasimov <foxtranigor@gmail.com>
+#
+
+"""
+A simple example to run MP2 calculation with CABS correction.
+"""
+
+import pyscf
+
+mol = pyscf.M(atom='H 0 0 0; F 0 0 1.1', basis='ccpvdz')
+
+mf = mol.RHF().run()
+
+mf.MP2().run()
+
+pyscf.mp.cabs.energy_singles(mf, auxbasis='ccpvdzri')
diff --git a/examples/pbc/22-k_points_gw.py b/examples/pbc/22-k_points_gw.py
index 2c1604bc28..854807600a 100644
--- a/examples/pbc/22-k_points_gw.py
+++ b/examples/pbc/22-k_points_gw.py
@@ -3,10 +3,8 @@
 '''
 G0W0 with k-points sampling
 '''
-
-from functools import reduce
-import numpy
-from pyscf.pbc import gto, scf, gw
+import numpy as np
+from pyscf.pbc import df, gto, scf, gw
 
 cell = gto.Cell()
 cell.atom='''
@@ -23,21 +21,59 @@
 cell.verbose = 5
 cell.build()
 
-#
-# KDFT and KGW with 2x2x2 k-points
-#
-kpts = cell.make_kpts([2,2,2])
-kmf = scf.KRKS(cell).density_fit()
-kmf.kpts = kpts
-emf = kmf.kernel()
+kpts = cell.make_kpts([2, 2, 2])
+gdf = df.RSDF(cell, kpts)
+gdf.build()
+
+# restricted KGW
+kmf = scf.KRKS(cell, kpts).rs_density_fit()
+kmf.with_df = gdf
+kmf.kernel()
+
+# KRGWAC using analytical continuation
+mygw = gw.krgw_ac.KRGWAC(kmf)
+mygw.kernel()
+
+# KRGWAC low-memory routine
+# finite-size correction is not implemented for outcore routine
+mygw = gw.krgw_ac.KRGWAC(kmf)
+mygw.outcore = True
+mygw.fc = False
+mygw.kernel()
 
-# Default is AC frequency integration
-mygw = gw.KRGW(kmf)
+# KRGWAC full self-energy and density of states
+mygw = gw.krgw_ac.KRGWAC(kmf)
+mygw.fullsigma = True
 mygw.kernel()
-print("KRGW energies =", mygw.mo_energy)
+omega = np.linspace(-1, 1, 201)
+# gf: GW Green's function; gf0: DFT Green's function; sigma: self-energy
+gf, gf0, sigma = mygw.make_gf(omega, eta=1e-2)
+print("k=0 density of states")
+for i in range(len(omega)):
+    print(omega[i], -np.trace(gf[0, :, :, i].imag) / np.pi)
 
 # With CD frequency integration
 #mygw = gw.KRGW(kmf, freq_int='cd')
 #mygw.kernel()
 #print("KRGW-CD energies =", mygw.mo_energy)
 
+# restricted KGW
+kmf = scf.KUKS(cell, kpts).rs_density_fit()
+kmf.with_df = gdf
+kmf.kernel()
+
+# KUGWAC using analytical continuation with finite-size correction
+mygw = gw.kugw_ac.KUGWAC(kmf)
+mygw.fc = True
+mygw.kernel()
+
+# KUGWAC full self-energy and density of states
+mygw = gw.kugw_ac.KUGWAC(kmf)
+mygw.fullsigma = True
+mygw.kernel()
+omega = np.linspace(-1, 1, 201)
+# gf: GW Green's function; gf0: DFT Green's function; sigma: self-energy
+gf, gf0, sigma = mygw.make_gf(omega, eta=1e-2)
+print("k=0 density of states: alpha beta")
+for i in range(len(omega)):
+    print(omega[i], -np.trace(gf[0, 0, :, :, i].imag) / np.pi, -np.trace(gf[0, 1, :, :, i].imag) / np.pi)
diff --git a/examples/pbc/22-k_points_rpa.py b/examples/pbc/22-k_points_rpa.py
new file mode 100644
index 0000000000..b21b02106d
--- /dev/null
+++ b/examples/pbc/22-k_points_rpa.py
@@ -0,0 +1,114 @@
+'''
+RPA with k-points sampling
+'''
+
+from pyscf.pbc import gto, df, dft, scf
+from pyscf.pbc.gw.krpa import KRPA
+from pyscf.pbc.gw.kurpa import KURPA
+
+# spin-restricted RPA
+cell = gto.Cell()
+cell.build(
+    unit='angstrom',
+    a="""
+            0.000000     1.783500     1.783500
+            1.783500     0.000000     1.783500
+            1.783500     1.783500     0.000000
+        """,
+    atom='C 1.337625 1.337625 1.337625; C 2.229375 2.229375 2.229375',
+    dimension=3,
+    max_memory=12000,
+    verbose=5,
+    pseudo='gth-pbe',
+    basis='gth-dzv',
+    precision=1e-12,
+)
+
+kpts = cell.make_kpts([3, 1, 1], scaled_center=[0, 0, 0])
+gdf = df.RSGDF(cell, kpts)
+gdf.build()
+kmf = scf.KRHF(cell, kpts).rs_density_fit()
+kmf.with_df = gdf
+kmf.kernel()
+
+# RPA with finite-size correction
+rpa = KRPA(kmf)
+rpa.fc = True
+rpa.kernel()
+# RPA with finite-size correction
+rpa = KRPA(kmf)
+rpa.fc = False
+rpa.kernel()
+# low-memory routine
+rpa = KRPA(kmf)
+rpa.outcore = True
+rpa.segsize = 2
+rpa.kernel()
+
+# Na (metallic)
+cell = gto.Cell()
+cell.build(
+    unit='angstrom',
+    a="""
+         -2.11250000000000   2.11250000000000   2.11250000000000
+        2.11250000000000  -2.11250000000000   2.11250000000000
+        2.11250000000000   2.11250000000000  -2.11250000000000
+        """,
+    atom="""Na   0.00000   0.00000   0.00000""",
+    dimension=3,
+    max_memory=126000,
+    verbose=5,
+    pseudo='gth-pade',
+    basis='gth-dzvp-molopt-sr',
+    precision=1e-10,
+)
+
+kpts = cell.make_kpts([2, 2, 1], scaled_center=[0, 0, 0])
+gdf = df.RSGDF(cell, kpts)
+gdf.build()
+
+kmf = dft.KRKS(cell, kpts).rs_density_fit()
+kmf = scf.addons.smearing_(kmf, sigma=5e-3, method='fermi')
+kmf.xc = 'lda'
+kmf.with_df = gdf
+kmf.kernel()
+
+rpa = KRPA(kmf)
+rpa.kernel()
+# use ACFDT exchange energy
+rpa = KRPA(kmf)
+rpa.acfd_exx = True
+rpa.kernel()
+
+# spin-unrestricted RPA
+cell = gto.Cell()
+cell.build(
+    unit='B',
+    a=[[0.0, 6.74027466, 6.74027466], [6.74027466, 0.0, 6.74027466], [6.74027466, 6.74027466, 0.0]],
+    atom="""H 0 0 0
+            H 1.68506866 1.68506866 1.68506866
+            H 3.37013733 3.37013733 3.37013733""",
+    basis='gth-dzvp',
+    pseudo='gth-pade',
+    verbose=5,
+    charge=0,
+    spin=1,
+)
+
+cell.spin = cell.spin * 3
+kpts = cell.make_kpts([3, 1, 1], scaled_center=[0, 0, 0])
+gdf = df.RSDF(cell, kpts)
+gdf.build()
+
+kmf = scf.KUHF(cell, kpts, exxdiv='ewald').rs_density_fit()
+kmf = scf.addons.smearing_(kmf, sigma=5e-3, method='fermi')
+kmf.xc = 'lda'
+kmf.with_df = gdf
+kmf.kernel()
+
+rpa = KURPA(kmf)
+rpa.kernel()
+# use ACFDT exchange energy
+rpa = KURPA(kmf)
+rpa.acfd_exx = True
+rpa.kernel()
diff --git a/examples/scf/02-ghf.py b/examples/scf/02-ghf.py
index 6a6963c5c0..b32be1c68d 100644
--- a/examples/scf/02-ghf.py
+++ b/examples/scf/02-ghf.py
@@ -4,14 +4,23 @@
 #
 
 '''
-scf.GHF, real, complex.
+Examples of generalized Hartree–Fock (GHF) calculations.
+
+Each molecular orbital in GHF is represented in a two-component basis (alpha
+beta components). Typically, the GHF orbital coefficient matrix (mo_coeff) has
+dimension 2N x 2N, where N is the number of AOs (mol.nao). The alpha
+components are stored in the upper block (mo_coeff[:N]) and the beta components
+are stored in the lower block (mo_coeff[N:]).
+
+This example demonstrates
+
+1. Real-valued GHF calculations.
+2. Complex-valued GHF calculations.
+3. Breaking the Sz spin symmetry in GHF.
 '''
 
 from pyscf import gto, scf
 
-#
-# 1. real GHF
-#
 mol = gto.M(
     atom = '''
 O 0 0      0
@@ -22,14 +31,51 @@
     spin = 1  # = 2S = spin_up - spin_down
 )
 
-mf = scf.GHF(mol)
+#
+# 1. Real-valued GHF
+#
+# For a non-relativistic Hamiltonian with only real-valued integrals, the GHF
+# solution is normally real. In this case, the converged GHF solution is usually
+# equivalent to the corresponding UHF solution. Although the Hamiltonian itself
+# does not couple the alpha and beta spin channels, degeneracy can lead to the
+# rotation within the alpha and beta orbitals, leading to spin mixed spin
+# components in the GHF orbitals.
+#
+mf = mol.GHF()
 mf.kernel()
 
 #
-# 2. complex GHF
+# 2. Complex-valued GHF
+#
+# GHF can also optimize complex-valued orbitals. One way to obtain such a
+# solution is to start the SCF procedure from a complex density matrix.
+#
+mf = mol.GHF()
+dm = mf.get_init_guess() + 0j
+dm[0,0] += .05j
+dm[1,1] -= .05j
+mf.kernel(dm0=dm)
+
+#
+# 3. Breaking the Sz spin symmetry
+#
+# Spin-orbit coupling (SOC) operator can mix alpha and beta components. The SOC
+# term can be enabled by the X2C relativistic calculations with GHF (see also
+# examples/x2c/03-x2c_ghf.py) or the configuration mf.with_soc in the case of
+# ECP-SOC calculations (see also examples/scf/44-soc_ecp.py).
+#
+mf = mol.GHF().x2c()
+mf.run()
+
+#
+# A non-zero alpha-beta block in the density matrix explicitly couples the two
+# spin sectors. Such initial guesses can drive the SCF procedure toward a
+# solution that breaks the Sz symmetry, even without an explicit SOC term in the
+# Hamiltonian.
 #
-mf = scf.GHF(mol)
+mf = mol.GHF()
 dm = mf.get_init_guess() + 0j
-dm[0,:] += .05j
-dm[:,0] -= .05j
+nao = mol.nao
+dm[:nao,nao:] = 0.05j
+dm[nao:,:nao] = -0.05j
 mf.kernel(dm0=dm)
diff --git a/examples/scf/15-initial_guess.py b/examples/scf/15-initial_guess.py
index 98807bd96e..03f9480f0e 100644
--- a/examples/scf/15-initial_guess.py
+++ b/examples/scf/15-initial_guess.py
@@ -11,7 +11,6 @@
 initial guess.
 '''
 
-import tempfile
 from pyscf import gto
 from pyscf import scf
 
@@ -38,10 +37,7 @@
     basis = 'cc-pVDZ',
 )
 
-tmp_chkfile = tempfile.NamedTemporaryFile()
-chkfile_name = tmp_chkfile.name
 mf = scf.RHF(mol)
-mf.chkfile = chkfile_name
 mf.kernel(dm_init_guess)
 
 # If a numpy array is assigned to the attribute .init_guess, it will be used
diff --git a/examples/scf/21-x2c.py b/examples/scf/21-x2c.py
index 0a55200396..1b8762a7ba 100644
--- a/examples/scf/21-x2c.py
+++ b/examples/scf/21-x2c.py
@@ -1,7 +1,4 @@
 #!/usr/bin/env python
-#
-# Author: Qiming Sun <osirpt.sun@gmail.com>
-#
 
 '''
 Applying scalar relativistic effects by decorating the scf object with
diff --git a/examples/scf/32-break_spin_symm.py b/examples/scf/32-break_spin_symm.py
index 2511f757f3..6d28e3c703 100644
--- a/examples/scf/32-break_spin_symm.py
+++ b/examples/scf/32-break_spin_symm.py
@@ -7,6 +7,7 @@
 Break spin symmetry for UHF/UKS by initial guess.
 
 See also examples/dft/32-broken_symmetry_dft.py
+     and examples/scf/56-h2_symm_breaking.py
 '''
 
 import numpy
@@ -38,3 +39,17 @@
 dm_beta[:2,:2] = 0
 dm = (dm_alpha,dm_beta)
 mf.kernel(dm)
+
+#
+# Alternative: use the built-in HOMO-LUMO rotation (breaksym='mix').
+# Instead of zeroing atom blocks, this rotates the alpha and beta HOMOs
+# by +/-45 degrees into the LUMO:
+#   alpha HOMO -> (HOMO + LUMO) / sqrt(2)
+#   beta  HOMO -> (HOMO - LUMO) / sqrt(2)
+# The orbitals remain delocalized over the full molecule, giving a smoother
+# symmetry break that is less likely to collapse back to the RHF solution.
+# This option also works for UKS.
+#
+mf2 = scf.UHF(mol)
+mf2.init_guess_breaksym = 'mix'
+mf2.kernel()
diff --git a/examples/scf/41-hf_with_given_densityfit_ints.py b/examples/scf/41-hf_with_given_densityfit_ints.py
index 364b4804f3..5d772b2700 100644
--- a/examples/scf/41-hf_with_given_densityfit_ints.py
+++ b/examples/scf/41-hf_with_given_densityfit_ints.py
@@ -10,9 +10,9 @@
 examples/df/40-precompute_df_ints.py
 '''
 
-import tempfile
 import h5py
 from pyscf import gto, df, scf
+from pyscf import lib
 
 mol = gto.M(atom='H 0 0 0; F 0 0 1', basis='ccpvdz')
 
@@ -20,7 +20,7 @@
 int3c = df.incore.cholesky_eri(mol, auxbasis='ccpvdz-fit')
 
 # Integrals on disk
-ftmp = tempfile.NamedTemporaryFile()
+ftmp = lib.NamedTemporaryFile()
 df.outcore.cholesky_eri(mol, ftmp.name, auxbasis='ccpvdz-fit')
 
 
diff --git a/examples/scf/56-h2_symm_breaking.py b/examples/scf/56-h2_symm_breaking.py
index 0d7d2c9830..29f529b46e 100644
--- a/examples/scf/56-h2_symm_breaking.py
+++ b/examples/scf/56-h2_symm_breaking.py
@@ -1,12 +1,21 @@
 #!/usr/bin/env python
 # Author: James D Whitfield <jdwhitfield@gmail.com>
 '''
-Scan H2 molecule dissociation curve comparing UHF and RHF solutions per the 
-example of Szabo and Ostlund section 3.8.7
+Scan H2 molecule dissociation curve comparing UHF and RHF solutions per the
+example of Szabo and Ostlund section 3.8.7.
 
 The initial guess is obtained by mixing the HOMO and LUMO and is implemented
 as a function that can be used in other applications.
 
+NOTE: The HOMO-LUMO mixing strategy used here is now available as a built-in
+option via init_guess_breaksym='mix', which also works for UKS.  The manual
+init_guess_mixed function below is kept for educational purposes.  To use the
+built-in version replace uhf.kernel(init_guess_mixed(mol)) with:
+
+    uhf = scf.UHF(mol)
+    uhf.init_guess_breaksym = 'mix'
+    uhf.kernel()
+
 See also 16-h2_scan.py, 30-scan_pes.py, 32-break_spin_symm.py
 '''
 
@@ -23,18 +32,18 @@
 def init_guess_mixed(mol,mixing_parameter=numpy.pi/4):
     ''' Generate density matrix with broken spatial and spin symmetry by mixing
     HOMO and LUMO orbitals following ansatz in Szabo and Ostlund, Sec 3.8.7.
-    
+
     psi_1a = numpy.cos(q)*psi_homo + numpy.sin(q)*psi_lumo
     psi_1b = numpy.cos(q)*psi_homo - numpy.sin(q)*psi_lumo
-        
+
     psi_2a = -numpy.sin(q)*psi_homo + numpy.cos(q)*psi_lumo
     psi_2b =  numpy.sin(q)*psi_homo + numpy.cos(q)*psi_lumo
 
-    Returns: 
+    Returns:
         Density matrices, a list of 2D ndarrays for alpha and beta spins
     '''
     # opt: q, mixing parameter 0 < q < 2 pi
-    
+
     #based on init_guess_by_1e
     h1e = scf.hf.get_hcore(mol)
     s1e = scf.hf.get_ovlp(mol)
@@ -51,7 +60,7 @@ def init_guess_mixed(mol,mixing_parameter=numpy.pi/4):
 
     psi_homo=mo_coeff[:, homo_idx]
     psi_lumo=mo_coeff[:, lumo_idx]
-    
+
     Ca=numpy.zeros_like(mo_coeff)
     Cb=numpy.zeros_like(mo_coeff)
 
@@ -72,7 +81,7 @@ def init_guess_mixed(mol,mixing_parameter=numpy.pi/4):
         Cb[:,k]=mo_coeff[:,k]
 
     dm =scf.UHF(mol).make_rdm1( (Ca,Cb), (mo_occ,mo_occ) )
-    return dm 
+    return dm
 
 
 for b in numpy.arange(0.7, 4.01, 0.1):
diff --git a/pyproject.toml b/pyproject.toml
index 7518049d46..ff6a9d3a41 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,7 @@ dependencies = [
   'scipy>=1.6.0',
   'h5py>=2.7',
   'setuptools',
+  "psutil; sys_platform == 'win32'"
 ]
 
 [project.urls]
diff --git a/pyscf/__init__.py b/pyscf/__init__.py
index 571b5fad3e..37fd9a907b 100644
--- a/pyscf/__init__.py
+++ b/pyscf/__init__.py
@@ -35,7 +35,7 @@
 
 '''
 
-__version__ = '2.13.0'
+__version__ = '2.13.1'
 
 import os
 import sys
diff --git a/pyscf/adc/radc_amplitudes.py b/pyscf/adc/radc_amplitudes.py
index 02f978c8d6..5996c52d67 100644
--- a/pyscf/adc/radc_amplitudes.py
+++ b/pyscf/adc/radc_amplitudes.py
@@ -565,5 +565,5 @@ def _create_t2_h5cache():
     as a temporary workaround before figuring out a better solution to handle
     big t2 amplitudes.
     '''
-    tmpfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    tmpfile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     return h5py.File(tmpfile.name, 'w')
diff --git a/pyscf/adc/test/test_radc/test_ee_df_N2.py b/pyscf/adc/test/test_radc/test_ee_df_N2.py
index 55bcd677dd..c44605edc5 100644
--- a/pyscf/adc/test/test_radc/test_ee_df_N2.py
+++ b/pyscf/adc/test/test_radc/test_ee_df_N2.py
@@ -20,9 +20,7 @@
 import unittest
 import numpy as np
 import math
-from pyscf import gto
-from pyscf import scf
-from pyscf import adc
+from pyscf import gto, scf, adc, lib
 
 def setUpModule():
     global mol, mf, myadc, myadc_fr
@@ -49,8 +47,8 @@ def tearDownModule():
 
 def rdms_test(dm):
     r2_int = mol.intor('int1e_r2')
-    dm_ao = np.einsum('pi,ij,qj->pq', mf.mo_coeff, dm, mf.mo_coeff.conj())
-    r2 = np.einsum('pq,pq->',r2_int,dm_ao)
+    dm_ao = lib.einsum('pi,ij,qj->pq', mf.mo_coeff, dm, mf.mo_coeff.conj())
+    r2 = lib.einsum('pq,pq->',r2_int,dm_ao)
     return r2
 
 class KnownValues(unittest.TestCase):
@@ -72,10 +70,10 @@ def test_ee_adc2(self):
         self.assertAlmostEqual(p[3], 6.481812538202186e-30, 6)
 
         dm1_exc = np.array(myadc.make_rdm1())
-        self.assertAlmostEqual(rdms_test(dm1_exc[0]), 39.97509426976306, 4)
-        self.assertAlmostEqual(rdms_test(dm1_exc[1]), 39.97509426976296, 4)
-        self.assertAlmostEqual(rdms_test(dm1_exc[2]), 40.69394840350379, 4)
-        self.assertAlmostEqual(rdms_test(dm1_exc[3]), 40.99987050864409, 4)
+        self.assertAlmostEqual(rdms_test(dm1_exc[0]) - 39.97509426976306, 0, 3)
+        self.assertAlmostEqual(rdms_test(dm1_exc[1]) - 39.97509426976296, 0, 3)
+        self.assertAlmostEqual(rdms_test(dm1_exc[2]) - 40.69394840350379, 0, 3)
+        self.assertAlmostEqual(rdms_test(dm1_exc[3]) - 40.99987050864409, 0, 3)
 
 
     def test_ee_adc2x(self):
@@ -124,7 +122,7 @@ def test_ee_adc2x_cis(self):
         self.assertAlmostEqual(rdms_test(dm1_exc[3]), 40.91091417592432, 4)
 
 
-    def test_ee_adc3(self):
+    def test_ee_adc3_high_cost(self):
         myadc.method = "adc(3)"
         e, t_amp1, t_amp2 = myadc.kernel_gs()
 
diff --git a/pyscf/adc/test/test_uadc/test_ee_df_F2.py b/pyscf/adc/test/test_uadc/test_ee_df_F2.py
index 323427ffad..fae700bef9 100644
--- a/pyscf/adc/test/test_uadc/test_ee_df_F2.py
+++ b/pyscf/adc/test/test_uadc/test_ee_df_F2.py
@@ -120,7 +120,7 @@ def test_ee_adc2x_cis(self):
         self.assertAlmostEqual(rdms_test(dm1_exc[0][2],dm1_exc[1][2]), 40.49491598756553, 6)
         self.assertAlmostEqual(rdms_test(dm1_exc[0][3],dm1_exc[1][3]), 40.49491598756554, 6)
 
-    def test_ee_adc3(self):
+    def test_ee_adc3_high_cost(self):
         myadc.method = "adc(3)"
 
         e,v,p,x = myadc.kernel(nroots=4)
diff --git a/pyscf/adc/test/test_uadc/test_ee_rohf_CN.py b/pyscf/adc/test/test_uadc/test_ee_rohf_CN.py
index 09471cb7fb..c9e94391ab 100644
--- a/pyscf/adc/test/test_uadc/test_ee_rohf_CN.py
+++ b/pyscf/adc/test/test_uadc/test_ee_rohf_CN.py
@@ -122,7 +122,7 @@ def test_ee_adc2x(self):
         self.assertAlmostEqual(rdms_test(dm1_exc[0][2],dm1_exc[1][2]), 40.27044834802643, 4)
         self.assertAlmostEqual(rdms_test(dm1_exc[0][3],dm1_exc[1][3]), 40.64183214575419, 4)
 
-    def test_ee_adc3(self):
+    def test_ee_adc3_high_cost(self):
         myadc.method = "adc(3)"
 
         e,v,p,x = myadc.kernel(nroots=4)
diff --git a/pyscf/agf2/test/test_ragf2_h2o.py b/pyscf/agf2/test/test_ragf2_h2o.py
index 6f890638d7..d395309a4a 100644
--- a/pyscf/agf2/test/test_ragf2_h2o.py
+++ b/pyscf/agf2/test/test_ragf2_h2o.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import numpy as np
 import h5py
 from pyscf import gto, scf, agf2, lib
@@ -29,8 +28,8 @@ class KnownValues(unittest.TestCase):
     def setUpClass(self):
         self.mol = gto.M(atom='O 0 0 0; H 0 0 1; H 0 1 0', basis='cc-pvdz', verbose=0)
         self.mf = scf.RHF(self.mol)
-        self.mf.chkfile = tempfile.NamedTemporaryFile().name
         self.mf.conv_tol = 1e-12
+        self.mf.chkfile = lib.NamedTemporaryFile().name
         self.mf.run()
         self.gf2 = agf2.RAGF2(self.mf)
         self.gf2.conv_tol = 1e-7
diff --git a/pyscf/agf2/test/test_uagf2_beh.py b/pyscf/agf2/test/test_uagf2_beh.py
index 081e06b4e5..68857d1279 100644
--- a/pyscf/agf2/test/test_uagf2_beh.py
+++ b/pyscf/agf2/test/test_uagf2_beh.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import numpy as np
 from pyscf import gto, scf, agf2, lib
 
@@ -28,8 +27,8 @@ class KnownValues(unittest.TestCase):
     def setUpClass(self):
         self.mol = gto.M(atom='Be 0 0 0; H 0 0 1', basis='cc-pvdz', spin=1, verbose=0)
         self.mf = scf.UHF(self.mol)
-        self.mf.chkfile = tempfile.NamedTemporaryFile().name
         self.mf.conv_tol = 1e-12
+        self.mf.chkfile = lib.NamedTemporaryFile().name
         self.mf.run()
         self.gf2 = agf2.UAGF2(self.mf)
         self.gf2.conv_tol = 1e-7
@@ -72,9 +71,9 @@ def test_uagf2_beh_ea(self):
     def test_uagf2_outcore(self):
         # tests the out-of-core and chkfile support for AGF2 for BeH/cc-pvdz
         gf2 = agf2.UAGF2(self.mf)
-        gf2.chkfile = tempfile.NamedTemporaryFile().name
         gf2.max_memory = 1
         gf2.conv_tol = 1e-7
+        gf2.chkfile = lib.NamedTemporaryFile().name
         gf2.run()
         e_ip, v_ip = self.gf2.ipagf2(nroots=1)
         e_ea, v_ea = self.gf2.eaagf2(nroots=1)
diff --git a/pyscf/ao2mo/__init__.py b/pyscf/ao2mo/__init__.py
index fbe646eb56..02b62949dc 100644
--- a/pyscf/ao2mo/__init__.py
+++ b/pyscf/ao2mo/__init__.py
@@ -35,6 +35,7 @@
 from pyscf.ao2mo import incore
 from pyscf.ao2mo import outcore
 from pyscf.ao2mo import r_outcore
+from pyscf.ao2mo import nrr_outcore
 from pyscf.ao2mo.addons import load, restore
 
 def full(eri_or_mol, mo_coeff, erifile=None, dataname='eri_mo', intor='int2e',
@@ -147,6 +148,8 @@ def full(eri_or_mol, mo_coeff, erifile=None, dataname='eri_mo', intor='int2e',
     elif isinstance(eri_or_mol, gto.MoleBase):
         if '_spinor' in intor:
             mod = r_outcore
+        elif numpy.result_type(mo_coeff) == numpy.complex128:
+            mod = nrr_outcore
         else:
             mod = outcore
 
@@ -302,6 +305,8 @@ def general(eri_or_mol, mo_coeffs, erifile=None, dataname='eri_mo', intor='int2e
     elif isinstance(eri_or_mol, gto.MoleBase):
         if '_spinor' in intor:
             mod = r_outcore
+        elif numpy.result_type(*mo_coeffs) == numpy.complex128:
+            mod = nrr_outcore
         else:
             mod = outcore
 
diff --git a/pyscf/ao2mo/_ao2mo.py b/pyscf/ao2mo/_ao2mo.py
index a2b04b458d..5f06314e60 100644
--- a/pyscf/ao2mo/_ao2mo.py
+++ b/pyscf/ao2mo/_ao2mo.py
@@ -14,15 +14,15 @@
 # limitations under the License.
 
 import ctypes
-import _ctypes
 import numpy
 from pyscf import lib
 from pyscf.gto.moleintor import make_cintopt, make_loc, ascint3
 from pyscf.scf import _vhf
 
 libao2mo = lib.load_library('libao2mo')
+
 def _fpointer(name):
-    return ctypes.c_void_p(_ctypes.dlsym(libao2mo._handle, name))
+    return ctypes.cast(getattr(libao2mo, name), ctypes.c_void_p)
 
 class AO2MOpt:
     def __init__(self, mol, intor, prescreen='CVHFnoscreen', qcondname=None):
diff --git a/pyscf/ao2mo/nrr_outcore.py b/pyscf/ao2mo/nrr_outcore.py
index 39556e07ad..51d44ec086 100644
--- a/pyscf/ao2mo/nrr_outcore.py
+++ b/pyscf/ao2mo/nrr_outcore.py
@@ -18,11 +18,9 @@
 '''
 
 import time
-import tempfile
 import numpy
 import h5py
 import ctypes
-import _ctypes
 from pyscf import lib
 from pyscf import gto
 from pyscf.lib import logger
@@ -33,7 +31,7 @@
 
 libao2mo = lib.load_library('libao2mo')
 def _fpointer(name):
-    return ctypes.c_void_p(_ctypes.dlsym(libao2mo._handle, name))
+    return ctypes.cast(getattr(libao2mo, name), ctypes.c_void_p)
 
 IOBLK_SIZE = getattr(__config__, 'ao2mo_outcore_ioblk_size', 256)  # 256 MB
 IOBUF_WORDS = getattr(__config__, 'ao2mo_outcore_iobuf_words', 1e8)  # 1.6 GB
@@ -124,7 +122,7 @@ def general(mol, mo_coeffs, erifile, dataname='eri_mo',
               float(nij_pair)*nkl_pair*comp, nij_pair*nkl_pair*comp*16/1e6)
 
 # transform e1
-    swapfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    swapfile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     half_e1(mol, (mo_alph, mo_beta), swapfile.name, intor, aosym, comp,
             max_memory, ioblk_size, log)
     time_1pass = log.timer('AO->MO transformation for %s 1 pass'%intor,
@@ -189,7 +187,7 @@ def general(mol, mo_coeffs, erifile, dataname='eri_mo',
 def full_iofree(mol, mo_coeff, dataname='eri_mo', intor='int2e_sph',
                 motype='ghf', aosym='s1', comp=None, verbose=logger.debug,
                 **kwargs):
-    erifile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    erifile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     general(mol, (mo_coeff,)*4, erifile.name, dataname='eri_mo',
             intor=intor, motype=motype, aosym=aosym, comp=comp,
             verbose=verbose)
@@ -199,7 +197,7 @@ def full_iofree(mol, mo_coeff, dataname='eri_mo', intor='int2e_sph',
 def general_iofree(mol, mo_coeffs, dataname='eri_mo', intor='int2e_sph',
                    motype='ghf', aosym='s1', comp=None, verbose=logger.debug,
                    **kwargs):
-    erifile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    erifile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     general(mol, mo_coeffs, erifile.name, dataname='eri_mo',
             intor=intor, motype=motype, aosym=aosym, comp=comp,
             verbose=verbose)
diff --git a/pyscf/ao2mo/r_outcore.py b/pyscf/ao2mo/r_outcore.py
index 2c77b5f89a..66b4ce7ae6 100644
--- a/pyscf/ao2mo/r_outcore.py
+++ b/pyscf/ao2mo/r_outcore.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 
-import tempfile
 import numpy
 import h5py
 from pyscf import lib
@@ -103,7 +102,7 @@ def general(mol, mo_coeffs, erifile, dataname='eri_mo',
               float(nij_pair)*nkl_pair*comp, nij_pair*nkl_pair*comp*16/1e6)
 
 # transform e1
-    swapfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    swapfile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     half_e1(mol, mo_coeffs, swapfile.name, intor, aosym, comp,
             max_memory, ioblk_size, log)
 
@@ -253,7 +252,7 @@ def half_e1(mol, mo_coeffs, swapfile,
 
 def full_iofree(mol, mo_coeff, intor='int2e_spinor', aosym='s4', comp=None,
                 verbose=logger.WARN, **kwargs):
-    erifile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    erifile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     general(mol, (mo_coeff,)*4, erifile.name, dataname='eri_mo',
             intor=intor, aosym=aosym, comp=comp,
             verbose=verbose)
@@ -262,7 +261,7 @@ def full_iofree(mol, mo_coeff, intor='int2e_spinor', aosym='s4', comp=None,
 
 def general_iofree(mol, mo_coeffs, intor='int2e_spinor', aosym='s4', comp=None,
                    verbose=logger.WARN, **kwargs):
-    erifile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    erifile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     general(mol, mo_coeffs, erifile.name, dataname='eri_mo',
             intor=intor, aosym=aosym, comp=comp,
             verbose=verbose)
diff --git a/pyscf/ao2mo/semi_incore.py b/pyscf/ao2mo/semi_incore.py
index f5a488b838..11de1fda6c 100644
--- a/pyscf/ao2mo/semi_incore.py
+++ b/pyscf/ao2mo/semi_incore.py
@@ -290,7 +290,7 @@ def save(start, stop, buf):
     onnn2 = ao2mo.incore.general(mf._eri, (orbo,mo_coeff,mo_coeff,mo_coeff))
     print('    Time elapsed (s): ',logger.perf_counter() - start_time)
 
-    tmpfile2 = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    tmpfile2 = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
 
     print('\n\nCustom outcore transformation ...')
     orbo = mo_coeff[:,:nocc]
diff --git a/pyscf/ao2mo/test/test_incore.py b/pyscf/ao2mo/test/test_incore.py
index befa0a484c..430c7e00c0 100644
--- a/pyscf/ao2mo/test/test_incore.py
+++ b/pyscf/ao2mo/test/test_incore.py
@@ -16,7 +16,6 @@
 import ctypes
 import unittest
 from functools import reduce
-import tempfile
 import numpy
 import h5py
 from pyscf import lib
diff --git a/pyscf/ao2mo/test/test_init.py b/pyscf/ao2mo/test/test_init.py
index 21641746cd..66322ef869 100644
--- a/pyscf/ao2mo/test/test_init.py
+++ b/pyscf/ao2mo/test/test_init.py
@@ -16,7 +16,6 @@
 import ctypes
 import unittest
 from functools import reduce
-import tempfile
 import numpy
 import h5py
 from pyscf import lib
@@ -80,7 +79,7 @@ def test_full(self):
         with ao2mo.load(h5file, 'eri') as eri:
             self.assertEqual(eri.shape, (10,10))
 
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         ao2mo.kernel(mol, mo, ftmp, intor='int2e', dataname='eri')
         with ao2mo.load(ftmp, 'eri') as eri:
             self.assertEqual(eri.shape, (10,10))
@@ -97,7 +96,7 @@ def test_general(self):
         ao2mo.kernel(mol, [mo]*4, erifile=h5file, intor='int2e', dataname='eri')
         self.assertEqual(h5file['eri'].shape, (10,10))
 
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         ao2mo.kernel(mol, [mo]*4, ftmp, intor='int2e', dataname='eri')
         with ao2mo.load(ftmp.name, 'eri') as eri:
             self.assertEqual(eri.shape, (10,10))
diff --git a/pyscf/ao2mo/test/test_nrr_outcore.py b/pyscf/ao2mo/test/test_nrr_outcore.py
index a81e275b34..2f2c16218a 100644
--- a/pyscf/ao2mo/test/test_nrr_outcore.py
+++ b/pyscf/ao2mo/test/test_nrr_outcore.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy as np
 import h5py
 from pyscf import lib
diff --git a/pyscf/ao2mo/test/test_outcore.py b/pyscf/ao2mo/test/test_outcore.py
index 6dcd76644a..dc6e3a62da 100644
--- a/pyscf/ao2mo/test/test_outcore.py
+++ b/pyscf/ao2mo/test/test_outcore.py
@@ -16,7 +16,6 @@
 import ctypes
 import unittest
 from functools import reduce
-import tempfile
 import numpy
 import h5py
 from pyscf import lib
@@ -47,7 +46,7 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_nroutcore_grad(self):
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         erifile = ftmp.name
         eri_ao = mol.intor('int2e_ip1', aosym='s1').reshape(3,nao,nao,nao,nao)
         eriref = numpy.einsum('npjkl,pi->nijkl', eri_ao, mo)
@@ -64,7 +63,7 @@ def test_nroutcore_grad(self):
         self.assertTrue(numpy.allclose(eri1, eriref))
 
     def test_nroutcore_eri(self):
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         erifile = ftmp.name
         eri_ao = ao2mo.restore(1, mol.intor('int2e', aosym='s2kl'), nao)
         eriref = numpy.einsum('pjkl,pi->ijkl', eri_ao, mo)
diff --git a/pyscf/ao2mo/test/test_r_outcore.py b/pyscf/ao2mo/test/test_r_outcore.py
index 271968f950..66d9263da2 100644
--- a/pyscf/ao2mo/test/test_r_outcore.py
+++ b/pyscf/ao2mo/test/test_r_outcore.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 from pyscf import lib
 from pyscf import gto
@@ -51,7 +50,7 @@ def test_r_outcore_eri(self):
         numpy.random.seed(1)
         mo = numpy.random.random((n2c,n2c)) + numpy.random.random((n2c,n2c))*1j
         eriref = trans(eri0, [mo]*4)
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
 
         ao2mo.kernel(mol, mo, erifile=ftmp.name, intor='int2e_spinor', max_memory=10, ioblk_size=5)
         with ao2mo.load(ftmp) as eri1:
diff --git a/pyscf/ao2mo/test/test_semi_incore.py b/pyscf/ao2mo/test/test_semi_incore.py
index 8271c7b051..0fc95fbd01 100644
--- a/pyscf/ao2mo/test/test_semi_incore.py
+++ b/pyscf/ao2mo/test/test_semi_incore.py
@@ -16,7 +16,6 @@
 import ctypes
 import unittest
 from functools import reduce
-import tempfile
 import numpy
 import h5py
 from pyscf import lib
@@ -46,7 +45,7 @@ def test_general(self):
         mo = numpy.random.random((nao,nmo))
         eriref = ao2mo.incore.full(eri, mo)
 
-        tmpfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        tmpfile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         io_size = nao**2*4e-5
 
         semi_incore.general(eri, [mo]*4, tmpfile.name, ioblk_size=io_size)
@@ -67,7 +66,7 @@ def test_general_complex(self):
                             mo.conj(), mo, mo.conj(), mo)
         eriref = eriref.reshape(12**2,12**2)
 
-        tmpfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        tmpfile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         io_size = nao**2*4e-5
 
         semi_incore.general(eri, [mo]*4, tmpfile.name, ioblk_size=io_size)
diff --git a/pyscf/cc/gccsd.py b/pyscf/cc/gccsd.py
index 303b387ae3..934ebaadf0 100644
--- a/pyscf/cc/gccsd.py
+++ b/pyscf/cc/gccsd.py
@@ -127,7 +127,7 @@ def init_amps(self, eris=None):
         eia = mo_e[:nocc,None] - mo_e[None,nocc:]
         eijab = lib.direct_sum('ia,jb->ijab', eia, eia)
         t1 = eris.fock[:nocc,nocc:] / eia
-        eris_oovv = np.array(eris.oovv)
+        eris_oovv = np.asarray(eris.oovv)
         t2 = eris_oovv.conj() / eijab
         self.emp2 = 0.25*einsum('ijab,ijab', t2, eris_oovv).real
         logger.info(self, 'Init t2, MP2 energy = %.15g', self.emp2)
@@ -346,20 +346,23 @@ def _make_eris_incore(mycc, mo_coeff=None, ao2mofn=None):
     eris._common_init_(mycc, mo_coeff)
     nocc = eris.nocc
     nao, nmo = eris.mo_coeff.shape
+    nao = nao // 2
 
     if callable(ao2mofn):
         eri = ao2mofn(eris.mo_coeff).reshape([nmo]*4)
-    else:
-        assert (eris.mo_coeff.dtype == np.double)
-        mo_a = eris.mo_coeff[:nao//2]
-        mo_b = eris.mo_coeff[nao//2:]
+    elif eris.mo_coeff.dtype == np.float64:
+        mo_a = eris.mo_coeff[:nao]
+        mo_b = eris.mo_coeff[nao:]
         orbspin = eris.orbspin
         if orbspin is None:
             eri  = ao2mo.kernel(mycc._scf._eri, mo_a)
             eri += ao2mo.kernel(mycc._scf._eri, mo_b)
             eri1 = ao2mo.kernel(mycc._scf._eri, (mo_a,mo_a,mo_b,mo_b))
             eri += eri1
-            eri += eri1.T
+            if eri1.ndim == 2:
+                eri += eri1.T
+            else:
+                eri += eri1.transpose(2,3,0,1)
         else:
             mo = mo_a + mo_b
             eri = ao2mo.kernel(mycc._scf._eri, mo)
@@ -373,6 +376,16 @@ def _make_eris_incore(mycc, mo_coeff=None, ao2mofn=None):
 
         if eri.dtype == np.double:
             eri = ao2mo.restore(1, eri, nmo)
+    else:
+        assert eris.mo_coeff.dtype == np.complex128
+        mo_a = eris.mo_coeff[:nao]
+        mo_b = eris.mo_coeff[nao:]
+        eri_ao = ao2mo.restore(1, mycc._scf._eri, nao)
+        eri  = ao2mo.kernel(eri_ao, mo_a)
+        eri += ao2mo.kernel(eri_ao, mo_b)
+        eri1 = ao2mo.kernel(eri_ao, (mo_a,mo_a,mo_b,mo_b))
+        eri += eri1
+        eri += eri1.transpose(2,3,0,1)
 
     eri = eri.reshape(nmo,nmo,nmo,nmo)
     eri = eri.transpose(0,2,1,3) - eri.transpose(0,2,3,1)
@@ -395,30 +408,72 @@ def _make_eris_outcore(mycc, mo_coeff=None):
     eris = _PhysicistsERIs()
     eris._common_init_(mycc, mo_coeff)
     nocc = eris.nocc
-    nao, nmo = eris.mo_coeff.shape
+    mo = eris.mo_coeff
+    nao, nmo = mo.shape
+    nao = nao // 2
     nvir = nmo - nocc
-    assert (eris.mo_coeff.dtype == np.double)
-    mo_a = eris.mo_coeff[:nao//2]
-    mo_b = eris.mo_coeff[nao//2:]
+    mo_a = mo[:nao]
+    mo_b = mo[nao:]
     orbspin = eris.orbspin
 
     feri = eris.feri = lib.H5TmpFile()
-    dtype = np.result_type(eris.mo_coeff).char
-    eris.oooo = feri.create_dataset('oooo', (nocc,nocc,nocc,nocc), dtype)
-    eris.ooov = feri.create_dataset('ooov', (nocc,nocc,nocc,nvir), dtype)
-    eris.oovv = feri.create_dataset('oovv', (nocc,nocc,nvir,nvir), dtype)
-    eris.ovov = feri.create_dataset('ovov', (nocc,nvir,nocc,nvir), dtype)
-    eris.ovvo = feri.create_dataset('ovvo', (nocc,nvir,nvir,nocc), dtype)
-    eris.ovvv = feri.create_dataset('ovvv', (nocc,nvir,nvir,nvir), dtype)
-
-    if orbspin is None:
+    dtype = np.result_type(eris.mo_coeff)
+    eris.oooo = feri.create_dataset('oooo', (nocc,nocc,nocc,nocc), dtype.char)
+    eris.ooov = feri.create_dataset('ooov', (nocc,nocc,nocc,nvir), dtype.char)
+    eris.oovv = feri.create_dataset('oovv', (nocc,nocc,nvir,nvir), dtype.char)
+    eris.ovov = feri.create_dataset('ovov', (nocc,nvir,nocc,nvir), dtype.char)
+    eris.ovvo = feri.create_dataset('ovvo', (nocc,nvir,nvir,nocc), dtype.char)
+    eris.ovvv = feri.create_dataset('ovvv', (nocc,nvir,nvir,nvir), dtype.char)
+
+    if mo.dtype == np.complex128:
+        max_memory = mycc.max_memory-lib.current_memory()[0]
+        blksize = min(nocc, max(2, int(max_memory*1e6/dtype.itemsize/(nmo**3*2))))
+        max_memory = max(MEMORYMIN, max_memory)
+
+        orbo = mo[:,:nocc]
+        orbv = mo[:,nocc:]
+        fswap = lib.H5TmpFile()
+        ao2mo.kernel(mycc.mol, (orbo,mo,mo,mo), fswap, 'eri_mo',
+                     max_memory=max_memory, verbose=log)
+
+        for p0, p1 in lib.prange(0, nocc, blksize):
+            tmp = np.asarray(fswap['eri_mo'][p0*nmo:p1*nmo])
+            tmp = tmp.reshape(p1-p0, nmo, nmo, nmo)
+            eris.oooo[p0:p1] = (tmp[:,:nocc,:nocc,:nocc].transpose(0,2,1,3) -
+                                tmp[:,:nocc,:nocc,:nocc].transpose(0,2,3,1))
+            eris.ooov[p0:p1] = (tmp[:,:nocc,:nocc,nocc:].transpose(0,2,1,3) -
+                                tmp[:,nocc:,:nocc,:nocc].transpose(0,2,3,1))
+            eris.ovvv[p0:p1] = (tmp[:,nocc:,nocc:,nocc:].transpose(0,2,1,3) -
+                                tmp[:,nocc:,nocc:,nocc:].transpose(0,2,3,1))
+            eris.oovv[p0:p1] = (tmp[:,nocc:,:nocc,nocc:].transpose(0,2,1,3) -
+                                tmp[:,nocc:,:nocc,nocc:].transpose(0,2,3,1))
+            eris.ovov[p0:p1] = (tmp[:,:nocc,nocc:,nocc:].transpose(0,2,1,3) -
+                                tmp[:,nocc:,nocc:,:nocc].transpose(0,2,3,1))
+            eris.ovvo[p0:p1] = (tmp[:,nocc:,nocc:,:nocc].transpose(0,2,1,3) -
+                                tmp[:,:nocc,nocc:,nocc:].transpose(0,2,3,1))
+            tmp = None
+        fswap = None
+        cput0 = log.timer_debug1('transforming ovvv', *cput0)
+
+        eris.vvvv = feri.create_dataset('vvvv', (nvir,nvir,nvir,nvir), dtype.char)
+        fswap = lib.H5TmpFile()
+        ao2mo.kernel(mycc.mol, orbv, fswap, 'vvvv',
+                     max_memory=max_memory, verbose=log)
+        for p0, p1 in lib.prange(0, nvir, blksize):
+            tmp = np.asarray(fswap['vvvv'][p0*nvir:p1*nvir])
+            tmp = tmp.reshape(p1-p0, nvir, nvir, nvir)
+            eris.vvvv[p0:p1] = tmp.transpose(0,2,1,3) - tmp.transpose(0,2,3,1)
+            tmp = None
+        cput0 = log.timer_debug1('transforming vvvv', *cput0)
+
+    elif orbspin is None:
         orbo_a = mo_a[:,:nocc]
         orbv_a = mo_a[:,nocc:]
         orbo_b = mo_b[:,:nocc]
         orbv_b = mo_b[:,nocc:]
 
         max_memory = mycc.max_memory-lib.current_memory()[0]
-        blksize = min(nocc, max(2, int(max_memory*1e6/8/(nmo**3*2))))
+        blksize = min(nocc, max(2, int(max_memory*1e6/dtype.itemsize/(nmo**3*2))))
         max_memory = max(MEMORYMIN, max_memory)
 
         fswap = lib.H5TmpFile()
@@ -452,7 +507,7 @@ def _make_eris_outcore(mycc, mo_coeff=None):
             tmp = None
         cput0 = log.timer_debug1('transforming ovvv', *cput0)
 
-        eris.vvvv = feri.create_dataset('vvvv', (nvir,nvir,nvir,nvir), dtype)
+        eris.vvvv = feri.create_dataset('vvvv', (nvir,nvir,nvir,nvir), dtype.char)
         tril2sq = lib.square_mat_in_trilu_indices(nvir)
         fswap = lib.H5TmpFile()
         ao2mo.kernel(mycc.mol, (orbv_a,orbv_a,orbv_a,orbv_a), fswap, 'aaaa',
@@ -491,7 +546,7 @@ def _make_eris_outcore(mycc, mo_coeff=None):
         orbv = mo[:,nocc:]
 
         max_memory = mycc.max_memory-lib.current_memory()[0]
-        blksize = min(nocc, max(2, int(max_memory*1e6/8/(nmo**3*2))))
+        blksize = min(nocc, max(2, int(max_memory*1e6/dtype.itemsize/(nmo**3*2))))
         max_memory = max(MEMORYMIN, max_memory)
 
         fswap = lib.H5TmpFile()
@@ -520,7 +575,7 @@ def _make_eris_outcore(mycc, mo_coeff=None):
             tmp = None
         cput0 = log.timer_debug1('transforming ovvv', *cput0)
 
-        eris.vvvv = feri.create_dataset('vvvv', (nvir,nvir,nvir,nvir), dtype)
+        eris.vvvv = feri.create_dataset('vvvv', (nvir,nvir,nvir,nvir), dtype.char)
         sym_forbid = (orbspin[nocc:,None]!=orbspin[nocc:])[np.tril_indices(nvir)]
         tril2sq = lib.square_mat_in_trilu_indices(nvir)
 
diff --git a/pyscf/cc/rccsdt.py b/pyscf/cc/rccsdt.py
index 7fe8935828..a1204ba066 100644
--- a/pyscf/cc/rccsdt.py
+++ b/pyscf/cc/rccsdt.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@
 from pyscf.ao2mo import _ao2mo
 from pyscf.lib import logger
 from pyscf.mp.mp2 import get_nocc, get_nmo, get_frozen_mask, get_e_hf, _mo_without_core
-from pyscf.cc import ccsd, _ccsd
+from pyscf.cc import ccsd
 from pyscf import __config__
 
 
@@ -121,23 +121,6 @@ def unpack_t3_tri2block_(t3, t3_blk, map_, mask, i0, i1, j0, j1, k0, k1, nocc, n
     )
     return t3_blk
 
-def unpack_t3_tri2single_pair_(t3, t3_blk, map_, mask, i0, j0, k0, nocc, nvir):
-    assert t3.dtype == np.float64 and t3_blk.dtype == np.float64
-    assert map_.dtype == np.int64 and mask.dtype == np.bool_
-    t3 = np.ascontiguousarray(t3)
-    t3_blk = np.ascontiguousarray(t3_blk)
-    map_ = np.ascontiguousarray(map_)
-    mask = np.ascontiguousarray(mask)
-    _libccsdt.unpack_t3_tri2single_pair_(
-        t3.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
-        t3_blk.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
-        map_.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)),
-        mask.ctypes.data_as(ctypes.POINTER(ctypes.c_bool)),
-        ctypes.c_int64(i0), ctypes.c_int64(j0), ctypes.c_int64(k0),
-        ctypes.c_int64(nocc), ctypes.c_int64(nvir),
-    )
-    return t3_blk
-
 def unpack_t3_tri2block_pair_(t3, t3_blk, map_, mask, i0, i1, j0, j1, k0, k1, nocc, nvir, blk_i, blk_j, blk_k):
     assert t3.dtype == np.float64 and t3_blk.dtype == np.float64
     assert map_.dtype == np.int64 and mask.dtype == np.bool_
@@ -177,22 +160,6 @@ def accumulate_t3_block2tri_(t3, t3_blk, map_, i0, i1, j0, j1, k0, k1, nocc, nvi
     )
     return t3
 
-def accumulate_t3_single2tri_(t3, t3_blk, map_, i0, j0, k0, nocc, nvir, alpha, beta):
-    assert t3.dtype == np.float64 and t3_blk.dtype == np.float64
-    assert map_.dtype == np.int64
-    t3 = np.ascontiguousarray(t3)
-    t3_blk = np.ascontiguousarray(t3_blk)
-    map_ = np.ascontiguousarray(map_)
-    _libccsdt.accumulate_t3_single2tri_(
-        t3.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
-        t3_blk.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
-        map_.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)),
-        ctypes.c_int64(i0), ctypes.c_int64(j0), ctypes.c_int64(k0),
-        ctypes.c_int64(nocc), ctypes.c_int64(nvir),
-        ctypes.c_double(alpha), ctypes.c_double(beta)
-    )
-    return t3
-
 def _unpack_t3_(mycc, t3, t3_blk, i0, i1, j0, j1, k0, k1, blksize0=None, blksize1=None, blksize2=None):
     r'''Unpack triangular-stored T3 amplitudes into the block `t3_full[i0:i1, j0:j1, k0:k1, :, :, :]`'''
     if blksize0 is None: blksize0 = mycc.blksize
@@ -202,17 +169,9 @@ def _unpack_t3_(mycc, t3, t3_blk, i0, i1, j0, j1, k0, k1, blksize0=None, blksize
                         i0, i1, j0, j1, k0, k1, mycc.nocc, mycc.nmo - mycc.nocc, blksize0, blksize1, blksize2)
     return t3_blk
 
-def _unpack_t3_s_pair_(mycc, t3, t3_blk, i0, j0, k0):
-    r'''Unpack triangular-stored T3 amplitudes into the block
-    `t3_full[i0, j0, k0, :, :, :] + t3_full[j0, i0, k0, :, :, :].transpose(1, 0, 2)`
-    '''
-    unpack_t3_tri2single_pair_(t3, t3_blk, mycc.tri2block_map, mycc.tri2block_mask,
-                                i0, j0, k0, mycc.nocc, mycc.nmo - mycc.nocc)
-    return t3_blk
-
 def _unpack_t3_pair_(mycc, t3, t3_blk, i0, i1, j0, j1, k0, k1, blksize0=None, blksize1=None, blksize2=None):
     r'''Unpack triangular-stored T3 amplitudes into the block
-    `t3_full[i0:i1, j0:j1, k0:k1, :, :, :] + t3_full[k0:k1, j0:j1, i0:i1, :, :, :].transpose(0, 1, 2, 3, 5, 4)`
+    `t3_full[i0:i1, j0:j1, k0:k1, :, :, :] + t3_full[i0:i1, j0:j1, k0:k1, :, :, :].transpose(0, 1, 2, 4, 5, 3)`
     '''
     if blksize0 is None: blksize0 = mycc.blksize_oovv
     if blksize1 is None: blksize1 = mycc.nocc
@@ -230,11 +189,6 @@ def _accumulate_t3_(mycc, t3, t3_blk, i0, i1, j0, j1, k0, k1,
                         mycc.nocc, mycc.nmo - mycc.nocc, blksize0, blksize1, blksize2, alpha=alpha, beta=beta)
     return t3
 
-def _accumulate_t3_s_(mycc, t3, t3_blk, i0, j0, k0, alpha=1.0, beta=0.0):
-    accumulate_t3_single2tri_(t3, t3_blk, mycc.tri2block_map, i0, j0, k0,
-                                mycc.nocc, mycc.nmo - mycc.nocc, alpha=alpha, beta=beta)
-    return t3
-
 def setup_tri2block_rhf(mycc):
     '''Build the map used to unpack and accumulate between the triangular-stored T3 and the block of full T3 tensor.'''
     nx = lambda n, order: prod(n + i for i in range(order)) // factorial(order)
@@ -463,12 +417,13 @@ def intermediates_t1t2(mycc, imds, t2):
     einsum('lkdc,ljcd->kj', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=F_oo, alpha=-1.0, beta=1.0)
     W_oooo = t1_eris[:nocc, :nocc, :nocc, :nocc].copy()
     einsum('klcd,ijcd->klij', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=W_oooo, alpha=1.0, beta=1.0)
-    W_ovvo = - t1_eris[:nocc, nocc:, nocc:, :nocc]
-    einsum('klcd,ilad->kaci', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=W_ovvo, alpha=-1.0, beta=1.0)
-    einsum('kldc,ilad->kaci', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=W_ovvo, alpha=0.5, beta=1.0)
-    einsum('klcd,ilda->kaci', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=W_ovvo, alpha=0.5, beta=1.0)
-    W_ovov = - t1_eris[:nocc, nocc:, :nocc, nocc:]
-    einsum('kldc,liad->kaic', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=W_ovov, alpha=0.5, beta=1.0)
+    c_t2 = 2.0 * t2 - t2.transpose(0, 1, 3, 2)
+    W_ovvo = 2.0 * t1_eris[:nocc, nocc:, nocc:, :nocc] - t1_eris[:nocc, nocc:, :nocc, nocc:].transpose(0, 1, 3, 2)
+    einsum('mled,miea->ladi', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t2, out=W_ovvo, alpha=1.0, beta=1.0)
+    einsum('mlde,miea->ladi', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t2, out=W_ovvo, alpha=-0.5, beta=1.0)
+    c_t2 = None
+    W_ovov = t1_eris[:nocc, nocc:, :nocc, nocc:].copy()
+    einsum('mlde,imea->laid', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=W_ovov, alpha=-0.5, beta=1.0)
     imds.F_vv, imds.F_oo, imds.W_oooo, imds.W_ovvo, imds.W_ovov = F_vv, F_oo, W_oooo, W_ovvo, W_ovov
     return imds
 
@@ -495,12 +450,10 @@ def compute_r1r2(mycc, imds, t2):
     einsum("kj,ikab->ijab", F_oo, t2, out=r2, alpha=-1.0, beta=1.0)
     einsum("abcd,ijcd->ijab", t1_eris[nocc:, nocc:, nocc:, nocc:], t2, out=r2, alpha=0.5, beta=1.0)
     einsum("klij,klab->ijab", W_oooo, t2, out=r2, alpha=0.5, beta=1.0)
-    einsum("kajc,ikcb->ijab", W_ovov, t2, out=r2, alpha=1.0, beta=1.0)
-    einsum("kaci,kjcb->ijab", W_ovvo, t2, out=r2, alpha=-2.0, beta=1.0)
-    einsum("kaic,kjcb->ijab", W_ovov, t2, out=r2, alpha=1.0, beta=1.0)
-    einsum("kaci,jkcb->ijab", W_ovvo, t2, out=r2, alpha=1.0, beta=1.0)
-    W_ovvo = imds.W_ovvo = None
-    W_ovov = imds.W_ovov = None
+    einsum("kaci,kjcb->ijab", W_ovvo, c_t2, out=r2, alpha=0.5, beta=1.0)
+    einsum("kaic,jkcb->ijab", W_ovov, t2, out=r2, alpha=-0.5, beta=1.0)
+    einsum("kbic,jkca->ijab", W_ovov, t2, out=r2, alpha=-1.0, beta=1.0)
+    c_t2 = None
     return r1, r2
 
 def r1r2_add_t3_tri_(mycc, imds, r1, r2, t3):
@@ -523,9 +476,7 @@ def r1r2_add_t3_tri_(mycc, imds, r1, r2, t3):
                 t3_spin_summation_inplace_(t3_tmp, blksize**3, nvir, "P3_422", 1.0, 0.0)
                 einsum('jkbc,ijkabc->ia', t1_eris[j0:j1, k0:k1, nocc:, nocc:],
                     t3_tmp[:bi, :bj, :bk], out=r1[i0:i1, :], alpha=0.5, beta=1.0)
-    t3_tmp = None
 
-    t3_tmp = np.empty((blksize,) * 3 + (nvir,) * 3, dtype=t3.dtype)
     for k0, k1 in lib.prange(0, nocc, blksize):
         bk = k1 - k0
         for j0, j1 in lib.prange(0, nocc, blksize):
@@ -535,7 +486,7 @@ def r1r2_add_t3_tri_(mycc, imds, r1, r2, t3):
                 _unpack_t3_(mycc, t3, t3_tmp, k0, k1, i0, i1, j0, j1)
                 t3_spin_summation_inplace_(t3_tmp, blksize**3, nvir, "P3_201", 1.0, 0.0)
                 einsum("kc,kijcab->ijab", t1_fock[k0:k1, nocc:], t3_tmp[:bk, :bi, :bj],
-                    out=r2[i0:i1, j0:j1, :, :], alpha=0.5, beta=1.0)
+                        out=r2[i0:i1, j0:j1, :, :], alpha=0.5, beta=1.0)
                 einsum("bkcd,kijdac->ijab", t1_eris[nocc:, k0:k1, nocc:, nocc:],
                         t3_tmp[:bk, :bi, :bj], out=r2[i0:i1, j0:j1, :, :], alpha=1.0, beta=1.0)
                 einsum("jklc,kijcab->ilab", t1_eris[j0:j1, k0:k1, :nocc, nocc:],
@@ -578,14 +529,12 @@ def intermediates_t3(mycc, imds, t2):
     einsum('lbde,jlea->abdj', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_vvvo, alpha=-1.0, beta=1.0)
     einsum('lmdj,lmab->abdj', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_vvvo, alpha=1.0, beta=1.0)
 
-    W_ovvo = (2.0 * t1_eris[:nocc, nocc:, nocc:, :nocc] - t1_eris[:nocc, nocc:, :nocc, nocc:].transpose(0, 1, 3, 2))
-    einsum('mled,miea->ladi', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t2, out=W_ovvo, alpha=2.0, beta=1.0)
-    einsum('mlde,miea->ladi', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t2, out=W_ovvo, alpha=-1.0, beta=1.0)
+    einsum('mled,miea->ladi', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t2, out=imds.W_ovvo, alpha=1.0, beta=1.0)
+    einsum('mlde,miea->ladi', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t2, out=imds.W_ovvo, alpha=-0.5, beta=1.0)
     c_t2 = None
 
-    W_ovov = t1_eris[:nocc, nocc:, :nocc, nocc:].copy()
-    einsum('mlde,imea->laid', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=W_ovov, alpha=-1.0, beta=1.0)
-    imds.W_vooo, imds.W_ovvo, imds.W_ovov, imds.W_vvvo, imds.W_vvvv = W_vooo, W_ovvo, W_ovov, W_vvvo, W_vvvv
+    einsum('mlde,imea->laid', t1_eris[:nocc, :nocc, nocc:, nocc:], t2, out=imds.W_ovov, alpha=-0.5, beta=1.0)
+    imds.W_vooo, imds.W_vvvo, imds.W_vvvv = W_vooo, W_vvvo, W_vvvv
     return imds
 
 def intermediates_t3_add_t3_tri(mycc, imds, t3):
@@ -673,10 +622,8 @@ def compute_r3_tri(mycc, imds, t2, t3):
 
                 _unpack_t3_(mycc, t3, t3_tmp, i0, i1, j0, j1, k0, k1)
                 einsum('ad,ijkdbc->ijkabc', F_vv, t3_tmp[:bi, :bj, :bk], out=r3_tmp[:bi, :bj, :bk], alpha=1.0, beta=1.0)
-                _unpack_t3_(mycc, t3, t3_tmp, j0, j1, i0, i1, k0, k1)
-                einsum('bd,jikdac->ijkabc', F_vv, t3_tmp[:bj, :bi, :bk], out=r3_tmp[:bi, :bj, :bk], alpha=1.0, beta=1.0)
-                _unpack_t3_(mycc, t3, t3_tmp, k0, k1, j0, j1, i0, i1)
-                einsum('cd,kjidba->ijkabc', F_vv, t3_tmp[:bk, :bj, :bi], out=r3_tmp[:bi, :bj, :bk], alpha=1.0, beta=1.0)
+                einsum('bd,ijkadc->ijkabc', F_vv, t3_tmp[:bi, :bj, :bk], out=r3_tmp[:bi, :bj, :bk], alpha=1.0, beta=1.0)
+                einsum('cd,ijkabd->ijkabc', F_vv, t3_tmp[:bi, :bj, :bk], out=r3_tmp[:bi, :bj, :bk], alpha=1.0, beta=1.0)
 
                 _accumulate_t3_(mycc, r3, r3_tmp, i0, i1, j0, j1, k0, k1, alpha=1.0, beta=1.0)
         time2 = log.timer_debug1('t3: iter: W_vvvo, W_vooo, F_vv [%3d, %3d]:'%(k0, k1), *time2)
@@ -740,8 +687,7 @@ def compute_r3_tri(mycc, imds, t2, t3):
                 _unpack_t3_(mycc, t3, t3_tmp, j0, j1, 0, nocc, k0, k1, blksize_oovv, nocc, blksize_oovv)
                 einsum('lbid,jlkdac->ijkabc', W_ovov[:, :, i0:i1, :], t3_tmp[:bj, :, :bk],
                     out=r3_tmp[:bi, :bj, :bk], alpha=-1.0, beta=0.0)
-                _unpack_t3_(mycc, t3, t3_tmp, k0, k1, 0, nocc, j0, j1, blksize_oovv, nocc, blksize_oovv)
-                einsum('lcid,kljdab->ijkabc', W_ovov[:, :, i0:i1, :], t3_tmp[:bk, :, :bj],
+                einsum('lcid,jlkbad->ijkabc', W_ovov[:, :, i0:i1, :], t3_tmp[:bj, :, :bk],
                     out=r3_tmp[:bi, :bj, :bk], alpha=-1.0, beta=1.0)
                 _unpack_t3_pair_(mycc, t3, t3_tmp, j0, j1, 0, nocc, k0, k1)
                 einsum('laid,jlkdbc->ijkabc', W_ovov[:, :, i0:i1, :], t3_tmp[:bj, :, :bk],
@@ -750,8 +696,7 @@ def compute_r3_tri(mycc, imds, t2, t3):
                 _unpack_t3_(mycc, t3, t3_tmp, i0, i1, 0, nocc, k0, k1, blksize_oovv, nocc, blksize_oovv)
                 einsum('lajd,ilkdbc->ijkabc', W_ovov[:, :, j0:j1, :], t3_tmp[:bi, :, :bk],
                     out=r3_tmp[:bi, :bj, :bk], alpha=-1.0, beta=1.0)
-                _unpack_t3_(mycc, t3, t3_tmp, k0, k1, 0, nocc, i0, i1, blksize_oovv, nocc, blksize_oovv)
-                einsum('lcjd,klidba->ijkabc', W_ovov[:, :, j0:j1, :], t3_tmp[:bk, :, :bi],
+                einsum('lcjd,ilkabd->ijkabc', W_ovov[:, :, j0:j1, :], t3_tmp[:bi, :, :bk],
                     out=r3_tmp[:bi, :bj, :bk], alpha=-1.0, beta=1.0)
                 _unpack_t3_pair_(mycc, t3, t3_tmp, i0, i1, 0, nocc, k0, k1)
                 einsum('lbjd,ilkdac->ijkabc', W_ovov[:, :, j0:j1, :], t3_tmp[:bi, :, :bk],
@@ -760,8 +705,7 @@ def compute_r3_tri(mycc, imds, t2, t3):
                 _unpack_t3_(mycc, t3, t3_tmp, i0, i1, 0, nocc, j0, j1, blksize_oovv, nocc, blksize_oovv)
                 einsum('lakd,iljdcb->ijkabc', W_ovov[:, :, k0:k1, :], t3_tmp[:bi, :, :bj],
                     out=r3_tmp[:bi, :bj, :bk], alpha=-1.0, beta=1.0)
-                _unpack_t3_(mycc, t3, t3_tmp, j0, j1, 0, nocc, i0, i1, blksize_oovv, nocc, blksize_oovv)
-                einsum('lbkd,jlidca->ijkabc', W_ovov[:, :, k0:k1, :], t3_tmp[:bj, :, :bi],
+                einsum('lbkd,iljacd->ijkabc', W_ovov[:, :, k0:k1, :], t3_tmp[:bi, :, :bj],
                     out=r3_tmp[:bi, :bj, :bk], alpha=-1.0, beta=1.0)
                 _unpack_t3_pair_(mycc, t3, t3_tmp, i0, i1, 0, nocc, j0, j1)
                 einsum('lckd,iljdab->ijkabc', W_ovov[:, :, k0:k1, :], t3_tmp[:bi, :, :bj],
@@ -803,22 +747,16 @@ def compute_r3_tri(mycc, imds, t2, t3):
     W_oooo = imds.W_oooo = None
     time1 = log.timer_debug1('t3: W_oooo * t3', *time1)
 
-    t3_tmp_s = np.empty((nvir, nvir, nvir), dtype=t3.dtype)
-    r3_tmp_s = np.empty((nvir, nvir, nvir), dtype=t3.dtype)
     time2 = logger.process_clock(), logger.perf_counter()
-    for k0 in range(nocc):
-        for j0 in range(k0 + 1):
-            for i0 in range(j0 + 1):
-                _unpack_t3_s_pair_(mycc, t3, t3_tmp_s, i0, j0, k0)
-                einsum('abde,dec->abc', W_vvvv, t3_tmp_s, out=r3_tmp_s, alpha=0.5, beta=0.0)
-                _unpack_t3_s_pair_(mycc, t3, t3_tmp_s, i0, k0, j0)
-                einsum('acde,deb->abc', W_vvvv, t3_tmp_s, out=r3_tmp_s, alpha=0.5, beta=1.0)
-                _unpack_t3_s_pair_(mycc, t3, t3_tmp_s, j0, k0, i0)
-                einsum('bcde,dea->abc', W_vvvv, t3_tmp_s, out=r3_tmp_s, alpha=0.5, beta=1.0)
-                _accumulate_t3_s_(mycc, r3, r3_tmp_s, i0, j0, k0, alpha=1.0, beta=1.0)
-        time2 = log.timer_debug1('t3: iter: W_vvvv %3d:'%k0, *time2)
-    t3_tmp_s = None
-    r3_tmp_s = None
+    index = 0
+    for i0 in range(nocc):
+        for j0 in range(i0, nocc):
+            for k0 in range(j0, nocc):
+                einsum('abde,dec->abc', W_vvvv, t3[index], out=r3[index], alpha=1.0, beta=1.0)
+                einsum('acde,dbe->abc', W_vvvv, t3[index], out=r3[index], alpha=1.0, beta=1.0)
+                einsum('bcde,ade->abc', W_vvvv, t3[index], out=r3[index], alpha=1.0, beta=1.0)
+                index += 1
+        time2 = log.timer_debug1('t3: iter: W_vvvv %3d:'%i0, *time2)
     W_vvvv = imds.W_vvvv = None
     time1 = log.timer_debug1('t3: W_vvvv * t3', *time1)
     return r3
@@ -869,15 +807,9 @@ def update_amps_rccsdt_tri_(mycc, tamps, eris):
     # symmetrization
     r2 += r2.transpose(1, 0, 3, 2)
     time1 = log.timer_debug1('t1t2: symmetrize r2', *time1)
-    # divide by eijkabc
+    # divide by eijab
     r1r2_divide_e_(mycc, r1, r2, mo_energy)
     time1 = log.timer_debug1('t1t2: divide r1 & r2 by eia & eijab', *time1)
-
-    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2)]
-
-    t1 += r1
-    t2 += r2
-    time1 = log.timer_debug1('t1t2: update t1 & t2', *time1)
     time0 = log.timer_debug1('t1t2 total', *time0)
 
     # t3
@@ -897,11 +829,13 @@ def update_amps_rccsdt_tri_(mycc, tamps, eris):
     r3_tri_divide_e_(mycc, r3, mo_energy)
     time1 = log.timer_debug1('t3: divide r3 by eijkabc', *time1)
 
-    res_norm.append(np.linalg.norm(r3))
+    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2), np.linalg.norm(r3)]
 
+    t1 += r1
+    t2 += r2
     t3 += r3
-    r3 = None
-    time1 = log.timer_debug1('t3: update t3', *time1)
+    r1, r2, r3 = None, None, None
+    time1 = log.timer_debug1('t3: update t1, t2, t3', *time1)
     time0 = log.timer_debug1('t3 total', *time0)
     return res_norm
 
@@ -1087,9 +1021,9 @@ def restore_from_diis_(mycc, diis_file, inplace=True):
     else:
         mycc.tamps[:cc_order - 1] = tamps
         if mycc.do_tri_max_t:
-            mycc.tamp[-1] = np.zeros((nx(nocc, cc_order),) + (nvir,) * cc_order, dtype=ccvec.dtype)
+            mycc.tamps[-1] = np.zeros((nx(nocc, cc_order),) + (nvir,) * cc_order, dtype=ccvec.dtype)
         else:
-            mycc.tamp[-1] = np.zeros((nocc,) * cc_order + (nvir,) * cc_order, dtype=ccvec.dtype)
+            mycc.tamps[-1] = np.zeros((nocc,) * cc_order + (nvir,) * cc_order, dtype=ccvec.dtype)
     if inplace:
         mycc.diis = adiis
     return mycc
@@ -1337,8 +1271,8 @@ class RCCSDT(ccsd.CCSDBase):
 the same way as in CCSD. Additional attributes are:
 
     do_diis_max_t : bool
-        Whether to use DIIS to accelerate convergence. Note that enabling DIIS
-        will increase memory consumption.
+        Whether to use DIIS for the highest-order amplitudes to accelerate convergence.
+        Note that enabling DIIS will increase memory consumption.
     blksize, blksize_oovv, blksize_oooo :
         Batch sizes used to reduce the memory footprint during tensor contractions.
     einsum_backend : string
@@ -1359,7 +1293,7 @@ class RCCSDT(ccsd.CCSDBase):
         T amplitudes t1[i,a], t2[i,j,a,b]  (i,j in occ, a,b in virt)
     t3 :
         An array of shape (compressed_occ, nvir, nvir, nvir) for T3 amplitudes.
-        The occupied-oribtal dimension is stored in a compressed form for the
+        The occupied-orbital dimension is stored in a compressed form for the
         i <= j <= k index combinations. The compressed tensor can be expanded to
         the full tensor by self.tamps_tri2full(t3)
     tamps :
@@ -1490,8 +1424,11 @@ def ccsdt(self, tamps=None, eris=None):
         self._finalize()
         return self.e_corr, self.tamps
 
-    def ccsdt_q(self, tamps, eris=None):
-        raise NotImplementedError
+    def ccsdt_q(self, tamps=None, eris=None):
+        from pyscf.cc import rccsdt_q
+        if tamps is None: tamps = self.tamps
+        if eris is None: eris = self.ao2mo(self.mo_coeff)
+        return rccsdt_q.kernel(self, eris, tamps, self.verbose)
 
 class _IMDS:
 
@@ -1634,3 +1571,13 @@ def _make_df_eris_incore_rcc(mycc, mo_coeff=None):
     print('max(abs(t2 difference))                    % .10e' % np.max(np.abs(mycc.t2 - mycc2.t2)))
     print('max(abs(t3_tri - t3_tri_from_t3_full))     % .10e' % np.max(np.abs(t3_tri - t3_tri_from_t3_full)))
     print('max(abs(t3_full - t3_full_from_t3_tri))    % .10e' % np.max(np.abs(t3_full - t3_full_from_t3_tri)))
+
+    # ccsdt_q
+    # [Q] and (Q) energy correction
+    e_q_bracket, e_q_paren = mycc.ccsdt_q()
+    e_q_bracket2, e_q_paren2 = mycc2.ccsdt_q()
+    ref_e_q_bracket, ref_e_q_paren = -0.001412978902990858, -0.0017003938319959389
+    print('[Q] difference                      % .10e' % (e_q_bracket - e_q_bracket2))
+    print('(Q) difference                      % .10e' % (e_q_paren - e_q_paren2))
+    print('[Q] difference from reference       % .10e' % (e_q_bracket - ref_e_q_bracket))
+    print('(Q) difference from reference       % .10e' % (e_q_paren - ref_e_q_paren))
diff --git a/pyscf/cc/rccsdt_highm.py b/pyscf/cc/rccsdt_highm.py
index 7ab9bb5e1a..32fc128970 100644
--- a/pyscf/cc/rccsdt_highm.py
+++ b/pyscf/cc/rccsdt_highm.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,13 +27,11 @@
 '''
 
 import numpy as np
-import numpy
 import functools
 import ctypes
 from pyscf import lib
 from pyscf.lib import logger
-from pyscf.mp.mp2 import get_nocc, get_nmo, get_frozen_mask, get_e_hf, _mo_without_core
-from pyscf.cc import _ccsd, rccsdt
+from pyscf.cc import rccsdt
 from pyscf.cc.rccsdt import (_einsum, t3_spin_summation_inplace_, update_t1_fock_eris, intermediates_t1t2,
                             compute_r1r2, r1r2_divide_e_, intermediates_t3, _PhysicistsERIs, _IMDS)
 from pyscf import __config__
@@ -187,15 +185,9 @@ def update_amps_rccsdt_(mycc, tamps, eris):
     # symmetrization
     r2 += r2.transpose(1, 0, 3, 2)
     time1 = log.timer_debug1('t1t2: symmetrize r2', *time1)
-    # divide by eijkabc
+    # divide by eijab
     r1r2_divide_e_(mycc, r1, r2, mo_energy)
     time1 = log.timer_debug1('t1t2: divide r1 & r2 by eia & eijab', *time1)
-
-    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2)]
-
-    t1 += r1
-    t2 += r2
-    time1 = log.timer_debug1('t1t2: update t1 & t2', *time1)
     time0 = log.timer_debug1('t1t2 total', *time0)
 
     intermediates_t3(mycc, imds, t2)
@@ -215,11 +207,13 @@ def update_amps_rccsdt_(mycc, tamps, eris):
     r3_divide_e_(mycc, r3, mo_energy)
     time1 = log.timer_debug1('t3: divide r3 by eijkabc', *time1)
 
-    res_norm.append(np.linalg.norm(r3))
+    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2), np.linalg.norm(r3)]
 
+    t1 += r1
+    t2 += r2
     t3 += r3
-    r3 = None
-    time1 = log.timer_debug1('t3: update t3', *time1)
+    r1, r2, r3 = None, None, None
+    time1 = log.timer_debug1('t3: update t1, t2, t3', *time1)
     time0 = log.timer_debug1('t3 total', *time0)
     return res_norm
 
diff --git a/pyscf/cc/rccsdt_q.py b/pyscf/cc/rccsdt_q.py
new file mode 100644
index 0000000000..5ee1dd44f9
--- /dev/null
+++ b/pyscf/cc/rccsdt_q.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Yu Jin <yjin@flatironinstitute.org>
+#         Huanchen Zhai <hczhai.ok@gmail.com>
+#
+
+'''
+RHF-CCSDT(Q) for real integrals
+'''
+
+import functools
+import numpy as np
+import ctypes
+from pyscf import lib
+from pyscf.lib import logger
+from pyscf.cc.rccsdt import _einsum, _unpack_t3_, setup_tri2block_rhf
+from pyscf.cc.rccsdtq import t4_add_
+
+
+_libccsdt = lib.load_library('libccsdt')
+
+def eijkl_division_single_(A, eocc, evir, i, j, k, l, nvir):
+    assert A.dtype == np.float64 and A.flags['C_CONTIGUOUS'], "A must be a contiguous float64 array"
+    assert eocc.dtype == np.float64 and eocc.flags['C_CONTIGUOUS'], "eocc must be a contiguous float64 array"
+    assert evir.dtype == np.float64 and evir.flags['C_CONTIGUOUS'], "evir must be a contiguous float64 array"
+    _libccsdt.eijkl_division_single_(
+        A.ctypes.data_as(ctypes.c_void_p), eocc.ctypes.data_as(ctypes.c_void_p), evir.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int64(i), ctypes.c_int64(j), ctypes.c_int64(k), ctypes.c_int64(l), ctypes.c_int64(nvir)
+    )
+    return A
+
+def t4_spin_summation_single_inplace_(A, nvir, pattern, alpha=1.0, beta=0.0):
+    assert A.dtype == np.float64 and A.flags['C_CONTIGUOUS'], "A must be a contiguous float64 array"
+    pattern_c = pattern.encode('utf-8')
+    _libccsdt.t4_spin_summation_single_inplace_(
+        A.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int64(nvir), ctypes.c_char_p(pattern_c),
+        ctypes.c_double(alpha), ctypes.c_double(beta)
+    )
+    return A
+
+def kernel(mycc, eris=None, tamps=None, verbose=logger.NOTE):
+
+    time0 = logger.process_clock(), logger.perf_counter()
+    log = logger.new_logger(mycc, verbose)
+
+    if tamps is not None:
+        if len(tamps) != 3:
+            raise ValueError("tamps should be a list of length 3, containing T1, T2, and T3 amplitudes.")
+        if mycc.do_tri_max_t and len(tamps[2].shape) != 4:
+            raise ValueError("CC object uses compact T3 amplitudes but the input T3 is full.")
+        if not mycc.do_tri_max_t and len(tamps[2].shape) == 4:
+            raise ValueError("CC object uses full T3 amplitudes but the input T3 is compact.")
+    else:
+        tamps = mycc.tamps
+
+    if eris is None:
+        eris = mycc.ao2mo(mycc.mo_coeff)
+
+    if mycc.do_tri_max_t and (not hasattr(mycc, "tri2block_map") or mycc.tri2block_map is None):
+        mycc.tri2block_map, mycc.tri2block_mask, mycc.tri2block_tp = setup_tri2block_rhf(mycc)
+
+    name = mycc.__class__.__name__
+
+    backend = mycc.einsum_backend
+    einsum = functools.partial(_einsum, backend)
+
+    t1 = tamps[0]
+    nocc, nvir = t1.shape[0], t1.shape[1]
+
+    t2, t3 = tamps[1:3]
+    mo_energy = eris.mo_energy
+    e_occ = mo_energy[:nocc]
+    e_occ = np.ascontiguousarray(e_occ)
+    e_vir = mo_energy[nocc:]
+    e_vir = np.ascontiguousarray(e_vir)
+
+    eris_ovvv = eris.pppp[:nocc, nocc:, nocc:, nocc:].copy()
+    eris_oovo = eris.pppp[:nocc, :nocc, nocc:, :nocc].copy()
+    eris_oovv = eris.pppp[:nocc, :nocc, nocc:, nocc:].copy()
+    eris_ovvo = eris.pppp[:nocc, nocc:, nocc:, :nocc].copy()
+    eris_ovov = eris.pppp[:nocc, nocc:, :nocc, nocc:].copy()
+    eris_vvvv = eris.pppp[nocc:, nocc:, nocc:, nocc:].copy()
+    eris_oooo = eris.pppp[:nocc, :nocc, :nocc, :nocc].copy()
+
+    eris = None
+
+    def get_t3_slice(t3_blk, i, j):
+        if mycc.do_tri_max_t:
+            _unpack_t3_(mycc, t3, t3_blk, i, i + 1, j, j + 1, 0, nocc, 1, 1, nocc)
+        else:
+            t3_blk[0, 0, :nocc] = t3[i, j, :nocc]
+        return t3_blk
+
+    def compute_W_vvvvoo(W_vvvvoo_slice, j, k):
+        einsum('abef,fc->abce', eris_vvvv, t2[j, k], out=W_vvvvoo_slice, alpha=0.5, beta=0.0)
+        einsum('acef,fb->abce', eris_vvvv, t2[k, j], out=W_vvvvoo_slice, alpha=0.5, beta=1.0)
+        return W_vvvvoo_slice
+
+    def compute_W_vvoooo(W_vvoooo_slice, i, j, k):
+        einsum('eam,be->abm', eris_ovvo[i], t2[j, k], out=W_vvoooo_slice, alpha=1.0, beta=0.0)
+        einsum('ebm,ae->abm', eris_ovvo[j], t2[i, k], out=W_vvoooo_slice, alpha=1.0, beta=1.0)
+        einsum('ema,be->abm', eris_ovov[k], t2[j, i], out=W_vvoooo_slice, alpha=1.0, beta=1.0)
+        einsum('emb,ae->abm', eris_ovov[k], t2[i, j], out=W_vvoooo_slice, alpha=1.0, beta=1.0)
+        einsum('mn,nab->abm', eris_oooo[k, i], t2[:, j], out=W_vvoooo_slice, alpha=-0.5, beta=1.0)
+        einsum('mn,nba->abm', eris_oooo[k, j], t2[:, i], out=W_vvoooo_slice, alpha=-0.5, beta=1.0)
+        return W_vvoooo_slice
+
+    time1 = logger.process_clock(), logger.perf_counter()
+    t4_blk = np.empty((nvir,) * 4, dtype=t2.dtype)
+    z4_blk = np.empty_like(t4_blk)
+    t3_blk = np.empty((1,) * 2 + (nocc,) + (nvir,) * 3, dtype=t3.dtype)
+    W_vvoooo_slice = np.empty((nvir, nvir, nocc), dtype=t2.dtype)
+    W_vvvvoo_slice = np.empty((nvir, nvir, nvir, nvir), dtype=t2.dtype)
+    e_q_bracket = 0.0
+    e_q_paren = 0.0
+    for l in range(nocc):
+        for k in range(l + 1):
+            for j in range(k + 1):
+                for i in range(j + 1):
+
+                    if (i == j == k == l) or (i == j and j == k) or (j == k and k == l):
+                        continue
+                    elif i < j and j < k and k < l:
+                        factor = 24.0
+                    elif (i == j and j < k and k < l) or (i < j and j == k and k < l) or (i < j and j < k and k == l):
+                        factor = 12.0
+                    elif (i == j and j < k and k == l):
+                        factor = 6.0
+
+                    # z for (Q)
+                    get_t3_slice(t3_blk, k, l)
+                    einsum('am,mcdb->abcd', eris_oovo[i, j], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=0.0)
+                    einsum('bm,mcda->abcd', eris_oovo[j, i], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('eba,cde->abcd', eris_ovvv[j], t3_blk[0, 0, i], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('eab,cde->abcd', eris_ovvv[i], t3_blk[0, 0, j], out=z4_blk, alpha=1.0, beta=1.0)
+
+                    get_t3_slice(t3_blk, j, l)
+                    einsum('am,mbdc->abcd', eris_oovo[i, k], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('cm,mbda->abcd', eris_oovo[k, i], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('eca,bde->abcd', eris_ovvv[k], t3_blk[0, 0, i], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('eac,bde->abcd', eris_ovvv[i], t3_blk[0, 0, k], out=z4_blk, alpha=1.0, beta=1.0)
+
+                    get_t3_slice(t3_blk, j, k)
+                    einsum('am,mbcd->abcd', eris_oovo[i, l], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('dm,mbca->abcd', eris_oovo[l, i], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('eda,bce->abcd', eris_ovvv[l], t3_blk[0, 0, i], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('ead,bce->abcd', eris_ovvv[i], t3_blk[0, 0, l], out=z4_blk, alpha=1.0, beta=1.0)
+
+                    get_t3_slice(t3_blk, i, l)
+                    einsum('bm,madc->abcd', eris_oovo[j, k], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('cm,madb->abcd', eris_oovo[k, j], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('ecb,ade->abcd', eris_ovvv[k], t3_blk[0, 0, j], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('ebc,ade->abcd', eris_ovvv[j], t3_blk[0, 0, k], out=z4_blk, alpha=1.0, beta=1.0)
+
+                    get_t3_slice(t3_blk, i, k)
+                    einsum('bm,macd->abcd', eris_oovo[j, l], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('dm,macb->abcd', eris_oovo[l, j], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('edb,ace->abcd', eris_ovvv[l], t3_blk[0, 0, j], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('ebd,ace->abcd', eris_ovvv[j], t3_blk[0, 0, l], out=z4_blk, alpha=1.0, beta=1.0)
+
+                    get_t3_slice(t3_blk, i, j)
+                    einsum('cm,mabd->abcd', eris_oovo[k, l], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('dm,mabc->abcd', eris_oovo[l, k], t3_blk[0, 0], out=z4_blk, alpha=-1.0, beta=1.0)
+                    einsum('edc,abe->abcd', eris_ovvv[l], t3_blk[0, 0, k], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('ecd,abe->abcd', eris_ovvv[k], t3_blk[0, 0, l], out=z4_blk, alpha=1.0, beta=1.0)
+
+                    # t4
+                    compute_W_vvoooo(W_vvoooo_slice, i, j, k)
+                    einsum('abm,mdc->abcd', W_vvoooo_slice, t2[l], out=t4_blk, alpha=-1.0, beta=0.0)
+                    compute_W_vvoooo(W_vvoooo_slice, i, j, l)
+                    einsum('abm,mcd->abcd', W_vvoooo_slice, t2[k], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, i, k, j)
+                    einsum('acm,mdb->abcd', W_vvoooo_slice, t2[l], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, i, k, l)
+                    einsum('acm,mbd->abcd', W_vvoooo_slice, t2[j], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, i, l, j)
+                    einsum('adm,mcb->abcd', W_vvoooo_slice, t2[k], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, i, l, k)
+                    einsum('adm,mbc->abcd', W_vvoooo_slice, t2[j], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, j, k, i)
+                    einsum('bcm,mda->abcd', W_vvoooo_slice, t2[l], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, j, k, l)
+                    einsum('bcm,mad->abcd', W_vvoooo_slice, t2[i], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, j, l, i)
+                    einsum('bdm,mca->abcd', W_vvoooo_slice, t2[k], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, j, l, k)
+                    einsum('bdm,mac->abcd', W_vvoooo_slice, t2[i], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, k, l, i)
+                    einsum('cdm,mba->abcd', W_vvoooo_slice, t2[j], out=t4_blk, alpha=-1.0, beta=1.0)
+                    compute_W_vvoooo(W_vvoooo_slice, k, l, j)
+                    einsum('cdm,mab->abcd', W_vvoooo_slice, t2[i], out=t4_blk, alpha=-1.0, beta=1.0)
+
+                    compute_W_vvvvoo(W_vvvvoo_slice, j, k)
+                    einsum('abce,ed->abcd', W_vvvvoo_slice, t2[i, l], out=t4_blk, alpha=1.0, beta=1.0)
+                    einsum('dbce,ea->abcd', W_vvvvoo_slice, t2[l, i], out=t4_blk, alpha=1.0, beta=1.0)
+                    compute_W_vvvvoo(W_vvvvoo_slice, j, l)
+                    einsum('abde,ec->abcd', W_vvvvoo_slice, t2[i, k], out=t4_blk, alpha=1.0, beta=1.0)
+                    einsum('cbde,ea->abcd', W_vvvvoo_slice, t2[k, i], out=t4_blk, alpha=1.0, beta=1.0)
+                    compute_W_vvvvoo(W_vvvvoo_slice, k, l)
+                    einsum('acde,eb->abcd', W_vvvvoo_slice, t2[i, j], out=t4_blk, alpha=1.0, beta=1.0)
+                    einsum('bcde,ea->abcd', W_vvvvoo_slice, t2[j, i], out=t4_blk, alpha=1.0, beta=1.0)
+                    compute_W_vvvvoo(W_vvvvoo_slice, i, k)
+                    einsum('bace,ed->abcd', W_vvvvoo_slice, t2[j, l], out=t4_blk, alpha=1.0, beta=1.0)
+                    einsum('dace,eb->abcd', W_vvvvoo_slice, t2[l, j], out=t4_blk, alpha=1.0, beta=1.0)
+                    compute_W_vvvvoo(W_vvvvoo_slice, i, l)
+                    einsum('bade,ec->abcd', W_vvvvoo_slice, t2[j, k], out=t4_blk, alpha=1.0, beta=1.0)
+                    einsum('cade,eb->abcd', W_vvvvoo_slice, t2[k, j], out=t4_blk, alpha=1.0, beta=1.0)
+                    compute_W_vvvvoo(W_vvvvoo_slice, i, j)
+                    einsum('cabe,ed->abcd', W_vvvvoo_slice, t2[k, l], out=t4_blk, alpha=1.0, beta=1.0)
+                    einsum('dabe,ec->abcd', W_vvvvoo_slice, t2[l, k], out=t4_blk, alpha=1.0, beta=1.0)
+
+                    t4_add_(t4_blk, z4_blk, 1, nvir)
+                    eijkl_division_single_(t4_blk, e_occ, e_vir, i, j, k, l, nvir)
+                    t4_spin_summation_single_inplace_(t4_blk, nvir, 'P4_444', alpha=1.0, beta=0.0)
+
+                    e_q_paren += np.dot(z4_blk.ravel(), t4_blk.ravel()) * factor
+
+                    # z for [Q]
+                    einsum('ab,cd->abcd', eris_oovv[i, j], t2[k, l], out=z4_blk, alpha=1.0, beta=0.0)
+                    einsum('ac,bd->abcd', eris_oovv[i, k], t2[j, l], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('ad,bc->abcd', eris_oovv[i, l], t2[j, k], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('bc,ad->abcd', eris_oovv[j, k], t2[i, l], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('bd,ac->abcd', eris_oovv[j, l], t2[i, k], out=z4_blk, alpha=1.0, beta=1.0)
+                    einsum('cd,ab->abcd', eris_oovv[k, l], t2[i, j], out=z4_blk, alpha=1.0, beta=1.0)
+
+                    e_q_bracket += np.dot(z4_blk.ravel(), t4_blk.ravel()) * factor
+
+        time1 = log.timer_debug1('%s(Q): iter %3d:' % (name, l), *time1)
+
+    e_q_paren += e_q_bracket
+    e_q_bracket /= 12.0
+    e_q_paren /= 12.0
+
+    log.timer('%s(Q)' % name, *time0)
+    log.info("[Q] correction = % .12e    (Q) correction = % .12e" % (e_q_bracket, e_q_paren))
+    return e_q_bracket, e_q_paren
+
+
+if __name__ == '__main__':
+
+    from pyscf import gto, scf, lib
+    from pyscf.data.elements import chemcore
+    from pyscf.cc.rccsdt import RCCSDT
+    from pyscf.cc.rccsdt_highm import RCCSDT as RCCSDT_highm
+
+    atom = '''
+    O  1.416468653903   0.111264435953   0.000000000000
+    H  1.746241653903  -0.373945564047  -0.758561000000
+    H  2.102765241     -0.898304829      1.578786622
+    '''
+    basis = 'cc-pvdz'
+
+    mol = gto.M(atom=atom, basis=basis)
+    mol.verbose = 1
+    mol.max_memory = 10000
+    frozen = chemcore(mol)
+
+    mf = scf.RHF(mol).density_fit()
+    mf.conv_tol = 1e-12
+    mf.kernel()
+
+    mycc = RCCSDT(mf, frozen=frozen)
+    mycc.set_einsum_backend('numpy')
+    mycc.conv_tol = 1e-10
+    mycc.conv_tol_normt = 1e-8
+    mycc.max_cycle = 100
+    mycc.verbose = 3
+    mycc.blksize = 2
+    mycc.blksize_oovv = 2
+    mycc.blksize_oooo = 2
+    mycc.do_diis_max_t = False
+    mycc.incore_complete = True
+    ecorr, tamps = mycc.kernel()
+
+    ref_e_q_bracket = -0.001462052703
+    ref_e_q_paren = -0.001620887567
+
+    mycc.verbose = 8
+    e_q_bracket, e_q_paren = kernel(mycc)
+    print('[Q] corr: % .12f    Ref: % .12f    Diff: % .12e'%(
+        e_q_bracket, ref_e_q_bracket, e_q_bracket - ref_e_q_bracket))
+    print('(Q) corr: % .12f    Ref: % .12f    Diff: % .12e'%(
+        e_q_paren, ref_e_q_paren, e_q_paren - ref_e_q_paren))
+
+    mycc2 = RCCSDT_highm(mf, frozen=frozen)
+    mycc2.set_einsum_backend('numpy')
+    mycc2.conv_tol = 1e-10
+    mycc2.conv_tol_normt = 1e-8
+    mycc2.max_cycle = 100
+    mycc2.verbose = 3
+    mycc2.do_diis_max_t = False
+    mycc2.incore_complete = True
+    ecorr, tamps = mycc2.kernel()
+    e_q_bracket, e_q_paren = mycc2.ccsdt_q()
+    print('[Q] corr: % .12f    Ref: % .12f    Diff: % .12e'%(
+        e_q_bracket, ref_e_q_bracket, e_q_bracket - ref_e_q_bracket))
+    print('(Q) corr: % .12f    Ref: % .12f    Diff: % .12e'%(
+        e_q_paren, ref_e_q_paren, e_q_paren - ref_e_q_paren))
diff --git a/pyscf/cc/rccsdtq.py b/pyscf/cc/rccsdtq.py
index e9745191e5..eb899ae4ac 100644
--- a/pyscf/cc/rccsdtq.py
+++ b/pyscf/cc/rccsdtq.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,13 +27,11 @@
 '''
 
 import numpy as np
-import numpy
 import functools
 import ctypes
 from pyscf import lib
 from pyscf.lib import logger
-from pyscf.mp.mp2 import get_nocc, get_nmo, get_frozen_mask, get_e_hf, _mo_without_core
-from pyscf.cc import ccsd, _ccsd, rccsdt
+from pyscf.cc import rccsdt
 from pyscf.cc.rccsdt import (_einsum, t3_spin_summation_inplace_, symmetrize_tamps_tri_, purify_tamps_tri_,
                             update_t1_fock_eris, intermediates_t1t2, compute_r1r2, r1r2_divide_e_,
                             intermediates_t3, kernel, _PhysicistsERIs, format_size)
@@ -55,6 +53,15 @@ def t4_spin_summation_inplace_(A, nocc4, nvir, pattern, alpha=1.0, beta=0.0):
     )
     return A
 
+def t4_project_1_minus_p4_p31_inplace_(A, nocc4, nvir, alpha=1.0, beta=0.0):
+    assert A.dtype == np.float64 and A.flags['C_CONTIGUOUS'], "A must be a contiguous float64 array"
+    _libccsdt.t4_project_1_minus_p4_p31_inplace_(
+        A.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int64(nocc4), ctypes.c_int64(nvir),
+        ctypes.c_double(alpha), ctypes.c_double(beta)
+    )
+    return A
+
 def t4_add_(t4, r4, nocc4, nvir):
     assert t4.dtype == np.float64 and t4.flags['C_CONTIGUOUS'], "t4 must be a contiguous float64 array"
     assert r4.dtype == np.float64 and r4.flags['C_CONTIGUOUS'], "r4 must be a contiguous float64 array"
@@ -86,6 +93,28 @@ def unpack_t4_tri2block_(t4, t4_blk, map_, mask, i0, i1, j0, j1, k0, k1, l0, l1,
     )
     return t4_blk
 
+def unpack_t4_tri2block_triples_(t4, t4_blk, map_, mask, i0, i1, j0, j1, k0, k1, l0, l1,
+                                nocc, nvir, blk_i, blk_j, blk_k, blk_l):
+    assert t4.dtype == np.float64 and t4_blk.dtype == np.float64
+    assert map_.dtype == np.int64 and mask.dtype == np.bool_
+    t4 = np.ascontiguousarray(t4)
+    t4_blk = np.ascontiguousarray(t4_blk)
+    map_ = np.ascontiguousarray(map_)
+    mask = np.ascontiguousarray(mask)
+    _libccsdt.unpack_t4_tri2block_triples_(
+        t4.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+        t4_blk.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+        map_.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)),
+        mask.ctypes.data_as(ctypes.POINTER(ctypes.c_bool)),
+        ctypes.c_int64(i0), ctypes.c_int64(i1),
+        ctypes.c_int64(j0), ctypes.c_int64(j1),
+        ctypes.c_int64(k0), ctypes.c_int64(k1),
+        ctypes.c_int64(l0), ctypes.c_int64(l1),
+        ctypes.c_int64(nocc), ctypes.c_int64(nvir),
+        ctypes.c_int64(blk_i), ctypes.c_int64(blk_j), ctypes.c_int64(blk_k), ctypes.c_int64(blk_l)
+    )
+    return t4_blk
+
 def accumulate_t4_block2tri_(t4, t4_blk, map_, i0, i1, j0, j1, k0, k1, l0, l1,
                                 nocc, nvir, blk_i, blk_j, blk_k, blk_l, alpha, beta):
     assert t4.dtype == np.float64 and t4_blk.dtype == np.float64
@@ -107,6 +136,18 @@ def accumulate_t4_block2tri_(t4, t4_blk, map_, i0, i1, j0, j1, k0, k1, l0, l1,
     )
     return t4
 
+def r4_tri_divide_e_(mycc, r4, mo_energy):
+    nocc, nmo = mycc.nocc, mycc.nmo
+    nvir = nmo - nocc
+    assert r4.dtype == np.float64 and r4.flags['C_CONTIGUOUS'], "r4 must be a contiguous float64 array"
+    eia = np.ascontiguousarray(mo_energy[:nocc, None] - mo_energy[None, nocc:] - mycc.level_shift, dtype=np.float64)
+    _libccsdt.r4_tri_divide_e_(
+        r4.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+        eia.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+        ctypes.c_int64(nocc), ctypes.c_int64(nvir)
+    )
+    return r4
+
 def _unpack_t4_(mycc, t4, t4_blk, i0, i1, j0, j1, k0, k1, l0, l1,
                     blksize0=None, blksize1=None, blksize2=None, blksize3=None):
     if blksize0 is None: blksize0 = mycc.blksize
@@ -117,6 +158,16 @@ def _unpack_t4_(mycc, t4, t4_blk, i0, i1, j0, j1, k0, k1, l0, l1,
                         mycc.nocc, mycc.nmo - mycc.nocc, blksize0, blksize1, blksize2, blksize3)
     return t4_blk
 
+def _unpack_t4_triples_(mycc, t4, t4_blk, i0, i1, j0, j1, k0, k1, l0, l1,
+                        blksize0=None, blksize1=None, blksize2=None, blksize3=None):
+    if blksize0 is None: blksize0 = mycc.blksize
+    if blksize1 is None: blksize1 = mycc.blksize
+    if blksize2 is None: blksize2 = mycc.blksize
+    if blksize3 is None: blksize3 = mycc.blksize
+    unpack_t4_tri2block_triples_(t4, t4_blk, mycc.tri2block_map, mycc.tri2block_mask, i0, i1, j0, j1, k0, k1, l0, l1,
+                        mycc.nocc, mycc.nmo - mycc.nocc, blksize0, blksize1, blksize2, blksize3)
+    return t4_blk
+
 def _accumulate_t4_(mycc, t4, t4_blk, i0, i1, j0, j1, k0, k1, l0, l1,
                     blksize0=None, blksize1=None, blksize2=None, blksize3=None, alpha=1.0, beta=0.0):
     if blksize0 is None: blksize0 = mycc.blksize
@@ -198,26 +249,26 @@ def intermediates_t4_tri(mycc, imds, t2, t3, t4):
 
     einsum('me,mjab->abej', t1_fock[:nocc, nocc:], t2, out=W_vvvo, alpha=-1.0, beta=1.0)
 
-    W_ovvvoo = np.empty((nocc,) + (nvir,) * 3 + (nocc,) * 2)
-    einsum('maef,jibf->mabeij', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovvvoo, alpha=2.0, beta=0.0)
-    einsum('mafe,jibf->mabeij', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovvvoo, alpha=-1.0, beta=1.0)
-    einsum('mnei,njab->mabeij', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_ovvvoo, alpha=-2.0, beta=1.0)
-    einsum('nmei,njab->mabeij', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_ovvvoo, alpha=1.0, beta=1.0)
+    W_oovvvo = np.empty((nocc,) * 2 + (nvir,) * 3 + (nocc,))
+    einsum('maef,jibf->ijeabm', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_oovvvo, alpha=2.0, beta=0.0)
+    einsum('mafe,jibf->ijeabm', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_oovvvo, alpha=-1.0, beta=1.0)
+    einsum('mnei,njab->ijeabm', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_oovvvo, alpha=-2.0, beta=1.0)
+    einsum('nmei,njab->ijeabm', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_oovvvo, alpha=1.0, beta=1.0)
     c_t3 = np.empty_like(t3)
     t3_spin_summation(t3, c_t3, nocc**3, nvir, "P3_201", 1.0, 0.0)
-    einsum('nmfe,nijfab->mabeij', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_ovvvoo, alpha=0.5, beta=1.0)
-    einsum('mnfe,nijfab->mabeij', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_ovvvoo, alpha=-0.25, beta=1.0)
+    einsum('nmfe,nijfab->ijeabm', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_oovvvo, alpha=0.5, beta=1.0)
+    einsum('mnfe,nijfab->ijeabm', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_oovvvo, alpha=-0.25, beta=1.0)
     c_t3 = None
 
-    W_ovvovo = np.empty((nocc, nvir, nvir, nocc, nvir, nocc))
-    einsum('mafe,jibf->mabiej', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovvovo, alpha=1.0, beta=0.0)
-    einsum('mnie,njab->mabiej', t1_eris[:nocc, :nocc, :nocc, nocc:], t2, out=W_ovvovo, alpha=-1.0, beta=1.0)
-    einsum('nmef,injfab->mabiej', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_ovvovo, alpha=-0.5, beta=1.0)
+    W_ovovvo = np.empty((nocc, nvir, nocc, nvir, nvir, nocc))
+    einsum('mafe,jibf->iejabm', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovovvo, alpha=1.0, beta=0.0)
+    einsum('mnie,njab->iejabm', t1_eris[:nocc, :nocc, :nocc, nocc:], t2, out=W_ovovvo, alpha=-1.0, beta=1.0)
+    einsum('nmef,injfab->iejabm', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_ovovvo, alpha=-0.5, beta=1.0)
 
-    W_vooooo = np.empty((nvir,) + (nocc,) * 5)
-    einsum('mnek,ijae->amnijk', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_vooooo, alpha=1.0, beta=0.0)
-    einsum('mnef,ijkaef->amnijk', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_vooooo, alpha=0.5, beta=1.0)
-    W_vooooo += W_vooooo.transpose(0, 2, 1, 3, 5, 4)
+    W_ooooov = np.empty((nocc,) * 5 + (nvir,))
+    einsum('mnek,ijae->kjinma', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_ooooov, alpha=1.0, beta=0.0)
+    einsum('mnef,ijkaef->kjinma', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_ooooov, alpha=0.5, beta=1.0)
+    W_ooooov += W_ooooov.transpose(1, 0, 2, 4, 3, 5)
 
     W_vvoooo = np.empty((nvir,) * 2 + (nocc,) * 4)
     einsum('amef,ijkebf->abmijk', t1_eris[nocc:, :nocc, nocc:, nocc:], t3, out=W_vvoooo, alpha=1.0, beta=0.0)
@@ -252,10 +303,10 @@ def intermediates_t4_tri(mycc, imds, t2, t3, t4):
                         t4_tmp[:bn, :bi, :bj, :bk], out=W_vvvvoo[..., j0:j1, k0:k1], alpha=-0.5, beta=1.0)
     t4_tmp = None
 
-    W_ovvvoo += W_ovvvoo.transpose(0, 2, 1, 3, 5, 4)
+    W_oovvvo += W_oovvvo.transpose(1, 0, 2, 4, 3, 5)
     W_vvoooo += W_vvoooo.transpose(1, 0, 2, 4, 3, 5)
     W_vvvvoo += W_vvvvoo.transpose(0, 2, 1, 3, 5, 4)
-    imds.W_ovvvoo, imds.W_ovvovo, imds.W_vooooo = W_ovvvoo, W_ovvovo, W_vooooo
+    imds.W_oovvvo, imds.W_ovovvo, imds.W_ooooov = W_oovvvo, W_ovovvo, W_ooooov
     imds.W_vvoooo, imds.W_vvvvoo = W_vvoooo, W_vvvvoo
     return imds
 
@@ -274,17 +325,17 @@ def compute_r4_tri(mycc, imds, t2, t3, t4):
     F_oo, F_vv = imds.F_oo, imds.F_vv
     W_oooo, W_ovvo, W_ovov = imds.W_oooo, imds.W_ovvo, imds.W_ovov
     W_vvvo, W_vooo, W_vvvv = imds.W_vvvo, imds.W_vooo, imds.W_vvvv
-    W_ovvvoo, W_ovvovo, W_vooooo = imds.W_ovvvoo, imds.W_ovvovo, imds.W_vooooo
+    W_oovvvo, W_ovovvo, W_ooooov = imds.W_oovvvo, imds.W_ovovvo, imds.W_ooooov
     W_vvoooo, W_vvvvoo = imds.W_vvoooo, imds.W_vvvvoo
 
+    W_voov = np.ascontiguousarray(W_ovvo.transpose(1, 0, 3, 2))
+
     c_t3 = np.empty_like(t3)
     t3_spin_summation(t3, c_t3, nocc**3, nvir, "P3_201", 1.0, 0.0)
 
     # r4 = np.empty_like(t4)
     r4 = np.zeros_like(t4)
-
     time2 = logger.process_clock(), logger.perf_counter()
-    t4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
     r4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
     for l0, l1 in lib.prange(0, nocc, blksize):
         bl = l1 - l0
@@ -345,152 +396,128 @@ def compute_r4_tri(mycc, imds, t2, t3, t4):
                     einsum("dmlk,mijcab->ijklabcd", W_vooo[:, :, l0:l1, k0:k1],
                         t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
 
-                    einsum("mabeij,mklecd->ijklabcd", W_ovvvoo[..., i0:i1, j0:j1],
-                        c_t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("maceik,mjlebd->ijklabcd", W_ovvvoo[..., i0:i1, k0:k1],
-                        c_t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("madeil,mjkebc->ijklabcd", W_ovvvoo[..., i0:i1, l0:l1],
-                        c_t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mbaeji,mklecd->ijklabcd", W_ovvvoo[..., j0:j1, i0:i1],
-                        c_t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mcaeki,mjlebd->ijklabcd", W_ovvvoo[..., k0:k1, i0:i1],
-                        c_t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mdaeli,mjkebc->ijklabcd", W_ovvvoo[..., l0:l1, i0:i1],
-                        c_t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mbcejk,milead->ijklabcd", W_ovvvoo[..., j0:j1, k0:k1],
-                        c_t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mbdejl,mikeac->ijklabcd", W_ovvvoo[..., j0:j1, l0:l1],
-                        c_t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mcbekj,milead->ijklabcd", W_ovvvoo[..., k0:k1, j0:j1],
-                        c_t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mdbelj,mikeac->ijklabcd", W_ovvvoo[..., l0:l1, j0:j1],
-                        c_t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mcdekl,mijeab->ijklabcd", W_ovvvoo[..., k0:k1, l0:l1],
-                        c_t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-                    einsum("mdcelk,mijeab->ijklabcd", W_ovvvoo[..., l0:l1, k0:k1],
-                        c_t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.25, beta=1.0)
-
-                    einsum("mabiej,mklced->ijklabcd", W_ovvovo[..., i0:i1, :, j0:j1],
-                        t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mabiej,mlkdec->ijklabcd", W_ovvovo[..., i0:i1, :, j0:j1],
-                        t3[:, l0:l1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("maciek,mjlbed->ijklabcd", W_ovvovo[..., i0:i1, :, k0:k1],
-                        t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("madiel,mjkbec->ijklabcd", W_ovvovo[..., i0:i1, :, l0:l1],
-                        t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("maciek,mljdeb->ijklabcd", W_ovvovo[..., i0:i1, :, k0:k1],
-                        t3[:, l0:l1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("madiel,mkjceb->ijklabcd", W_ovvovo[..., i0:i1, :, l0:l1],
-                        t3[:, k0:k1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mbajei,mklced->ijklabcd", W_ovvovo[..., j0:j1, :, i0:i1],
-                        t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mbajei,mlkdec->ijklabcd", W_ovvovo[..., j0:j1, :, i0:i1],
-                        t3[:, l0:l1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mcakei,mjlbed->ijklabcd", W_ovvovo[..., k0:k1, :, i0:i1],
-                        t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mdalei,mjkbec->ijklabcd", W_ovvovo[..., l0:l1, :, i0:i1],
-                        t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mcakei,mljdeb->ijklabcd", W_ovvovo[..., k0:k1, :, i0:i1],
-                        t3[:, l0:l1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mdalei,mkjceb->ijklabcd", W_ovvovo[..., l0:l1, :, i0:i1],
-                        t3[:, k0:k1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mbcjek,milaed->ijklabcd", W_ovvovo[..., j0:j1, :, k0:k1],
-                        t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mbdjel,mikaec->ijklabcd", W_ovvovo[..., j0:j1, :, l0:l1],
-                        t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mcbkej,milaed->ijklabcd", W_ovvovo[..., k0:k1, :, j0:j1],
-                        t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mdblej,mikaec->ijklabcd", W_ovvovo[..., l0:l1, :, j0:j1],
-                        t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mcdkel,mijaeb->ijklabcd", W_ovvovo[..., k0:k1, :, l0:l1],
-                        t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mdclek,mijaeb->ijklabcd", W_ovvovo[..., l0:l1, :, k0:k1],
-                        t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mbcjek,mlidea->ijklabcd", W_ovvovo[..., j0:j1, :, k0:k1],
-                        t3[:, l0:l1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mbdjel,mkicea->ijklabcd", W_ovvovo[..., j0:j1, :, l0:l1],
-                        t3[:, k0:k1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mcbkej,mlidea->ijklabcd", W_ovvovo[..., k0:k1, :, j0:j1],
-                        t3[:, l0:l1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mdblej,mkicea->ijklabcd", W_ovvovo[..., l0:l1, :, j0:j1],
-                        t3[:, k0:k1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mcdkel,mjibea->ijklabcd", W_ovvovo[..., k0:k1, :, l0:l1],
-                        t3[:, j0:j1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                    einsum("mdclek,mjibea->ijklabcd", W_ovvovo[..., l0:l1, :, k0:k1],
-                        t3[:, j0:j1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-
-                    einsum("mcbiej,mklaed->ijklabcd", W_ovvovo[..., i0:i1, :, j0:j1],
+                    einsum("ijeabm,mklecd->ijklabcd", W_oovvvo[i0:i1, j0:j1],
+                        c_t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+                    einsum("ikeacm,mjlebd->ijklabcd", W_oovvvo[i0:i1, k0:k1],
+                        c_t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+                    einsum("ileadm,mjkebc->ijklabcd", W_oovvvo[i0:i1, l0:l1],
+                        c_t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+                    einsum("jkebcm,milead->ijklabcd", W_oovvvo[j0:j1, k0:k1],
+                        c_t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+                    einsum("jlebdm,mikeac->ijklabcd", W_oovvvo[j0:j1, l0:l1],
+                        c_t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+                    einsum("klecdm,mijeab->ijklabcd", W_oovvvo[k0:k1, l0:l1],
+                        c_t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+
+                    einsum("iejcbm,mklaed->ijklabcd", W_ovovvo[i0:i1, :, j0:j1],
                         t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mdbiej,mlkaec->ijklabcd", W_ovvovo[..., i0:i1, :, j0:j1],
+                    einsum("iejdbm,mlkaec->ijklabcd", W_ovovvo[i0:i1, :, j0:j1],
                         t3[:, l0:l1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mbciek,mjlaed->ijklabcd", W_ovvovo[..., i0:i1, :, k0:k1],
+                    einsum("iekbcm,mjlaed->ijklabcd", W_ovovvo[i0:i1, :, k0:k1],
                         t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mbdiel,mjkaec->ijklabcd", W_ovvovo[..., i0:i1, :, l0:l1],
+                    einsum("ielbdm,mjkaec->ijklabcd", W_ovovvo[i0:i1, :, l0:l1],
                         t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mdciek,mljaeb->ijklabcd", W_ovvovo[..., i0:i1, :, k0:k1],
+                    einsum("iekdcm,mljaeb->ijklabcd", W_ovovvo[i0:i1, :, k0:k1],
                         t3[:, l0:l1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mcdiel,mkjaeb->ijklabcd", W_ovvovo[..., i0:i1, :, l0:l1],
+                    einsum("ielcdm,mkjaeb->ijklabcd", W_ovovvo[i0:i1, :, l0:l1],
                         t3[:, k0:k1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mcajei,mklbed->ijklabcd", W_ovvovo[..., j0:j1, :, i0:i1],
+                    einsum("jeicam,mklbed->ijklabcd", W_ovovvo[j0:j1, :, i0:i1],
                         t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mdajei,mlkbec->ijklabcd", W_ovvovo[..., j0:j1, :, i0:i1],
+                    einsum("jeidam,mlkbec->ijklabcd", W_ovovvo[j0:j1, :, i0:i1],
                         t3[:, l0:l1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mbakei,mjlced->ijklabcd", W_ovvovo[..., k0:k1, :, i0:i1],
+                    einsum("keibam,mjlced->ijklabcd", W_ovovvo[k0:k1, :, i0:i1],
                         t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mbalei,mjkdec->ijklabcd", W_ovvovo[..., l0:l1, :, i0:i1],
+                    einsum("leibam,mjkdec->ijklabcd", W_ovovvo[l0:l1, :, i0:i1],
                         t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mdakei,mljceb->ijklabcd", W_ovvovo[..., k0:k1, :, i0:i1],
+                    einsum("keidam,mljceb->ijklabcd", W_ovovvo[k0:k1, :, i0:i1],
                         t3[:, l0:l1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mcalei,mkjdeb->ijklabcd", W_ovvovo[..., l0:l1, :, i0:i1],
+                    einsum("leicam,mkjdeb->ijklabcd", W_ovovvo[l0:l1, :, i0:i1],
                         t3[:, k0:k1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("macjek,milbed->ijklabcd", W_ovvovo[..., j0:j1, :, k0:k1],
+                    einsum("jekacm,milbed->ijklabcd", W_ovovvo[j0:j1, :, k0:k1],
                         t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("madjel,mikbec->ijklabcd", W_ovvovo[..., j0:j1, :, l0:l1],
+                    einsum("jeladm,mikbec->ijklabcd", W_ovovvo[j0:j1, :, l0:l1],
                         t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mabkej,milced->ijklabcd", W_ovvovo[..., k0:k1, :, j0:j1],
+                    einsum("kejabm,milced->ijklabcd", W_ovovvo[k0:k1, :, j0:j1],
                         t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mablej,mikdec->ijklabcd", W_ovvovo[..., l0:l1, :, j0:j1],
+                    einsum("lejabm,mikdec->ijklabcd", W_ovovvo[l0:l1, :, j0:j1],
                         t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("madkel,mijceb->ijklabcd", W_ovvovo[..., k0:k1, :, l0:l1],
+                    einsum("keladm,mijceb->ijklabcd", W_ovovvo[k0:k1, :, l0:l1],
                         t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("maclek,mijdeb->ijklabcd", W_ovvovo[..., l0:l1, :, k0:k1],
+                    einsum("lekacm,mijdeb->ijklabcd", W_ovovvo[l0:l1, :, k0:k1],
                         t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mdcjek,mlibea->ijklabcd", W_ovvovo[..., j0:j1, :, k0:k1],
+                    einsum("jekdcm,mlibea->ijklabcd", W_ovovvo[j0:j1, :, k0:k1],
                         t3[:, l0:l1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mcdjel,mkibea->ijklabcd", W_ovvovo[..., j0:j1, :, l0:l1],
+                    einsum("jelcdm,mkibea->ijklabcd", W_ovovvo[j0:j1, :, l0:l1],
                         t3[:, k0:k1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mdbkej,mlicea->ijklabcd", W_ovvovo[..., k0:k1, :, j0:j1],
+                    einsum("kejdbm,mlicea->ijklabcd", W_ovovvo[k0:k1, :, j0:j1],
                         t3[:, l0:l1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mcblej,mkidea->ijklabcd", W_ovvovo[..., l0:l1, :, j0:j1],
+                    einsum("lejcbm,mkidea->ijklabcd", W_ovovvo[l0:l1, :, j0:j1],
                         t3[:, k0:k1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mbdkel,mjicea->ijklabcd", W_ovvovo[..., k0:k1, :, l0:l1],
+                    einsum("kelbdm,mjicea->ijklabcd", W_ovovvo[k0:k1, :, l0:l1],
                         t3[:, j0:j1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                    einsum("mbclek,mjidea->ijklabcd", W_ovvovo[..., l0:l1, :, k0:k1],
+                    einsum("lekbcm,mjidea->ijklabcd", W_ovovvo[l0:l1, :, k0:k1],
                         t3[:, j0:j1, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
 
-                    einsum("amnijk,mnlbcd->ijklabcd", W_vooooo[..., i0:i1, j0:j1, k0:k1],
+                    _accumulate_t4_(mycc, r4, r4_tmp, i0, i1, j0, j1, k0, k1, l0, l1)
+        time2 = log.timer_debug1('t4: iter: W_vvvo * t3, W_vooo * t3, W_oovvvo * t3, W_ovovvo * t3'
+                                 ' [%3d, %3d]:' % (l0, l1), *time2)
+    r4_tmp = None
+    c_t3 = None
+    W_vvvo = imds.W_vvvo = None
+    W_vooo = imds.W_vooo = None
+    W_oovvvo = imds.W_oovvvo = None
+    time1 = log.timer_debug1('t4: W_vvvo * t3, W_vooo * t3, W_oovvvo * t3, W_ovovvo * t3', *time1)
+
+    c_t3 = t3 + t3.transpose(0, 1, 2, 4, 5, 3)
+    W_ovovvo += W_ovovvo.transpose(2, 1, 0, 4, 3, 5)
+    time2 = logger.process_clock(), logger.perf_counter()
+    t4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
+    r4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
+    for l0, l1 in lib.prange(0, nocc, blksize):
+        bl = l1 - l0
+        for k0, k1 in lib.prange(0, l1, blksize):
+            bk = k1 - k0
+            for j0, j1 in lib.prange(0, k1, blksize):
+                bj = j1 - j0
+                for i0, i1 in lib.prange(0, j1, blksize):
+                    bi = i1 - i0
+
+                    einsum("iejabm,mklced->ijklabcd", W_ovovvo[i0:i1, :, j0:j1],
+                        c_t3[:, k0:k1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=0.0)
+                    einsum("iekacm,mjlbed->ijklabcd", W_ovovvo[i0:i1, :, k0:k1],
+                        c_t3[:, j0:j1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+                    einsum("ieladm,mjkbec->ijklabcd", W_ovovvo[i0:i1, :, l0:l1],
+                        c_t3[:, j0:j1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+                    einsum("jekbcm,milaed->ijklabcd", W_ovovvo[j0:j1, :, k0:k1],
+                        c_t3[:, i0:i1, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+                    einsum("jelbdm,mikaec->ijklabcd", W_ovovvo[j0:j1, :, l0:l1],
+                        c_t3[:, i0:i1, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+                    einsum("kelcdm,mijaeb->ijklabcd", W_ovovvo[k0:k1, :, l0:l1],
+                        c_t3[:, i0:i1, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+
+                    einsum("kjinma,mnlbcd->ijklabcd", W_ooooov[k0:k1, j0:j1, i0:i1],
                         t3[:, :, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("amnijl,mnkbdc->ijklabcd", W_vooooo[..., i0:i1, j0:j1, l0:l1],
+                    einsum("ljinma,mnkbdc->ijklabcd", W_ooooov[l0:l1, j0:j1, i0:i1],
                         t3[:, :, k0:k1,], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("amnikl,mnjcdb->ijklabcd", W_vooooo[..., i0:i1, k0:k1, l0:l1],
+                    einsum("lkinma,mnjcdb->ijklabcd", W_ooooov[l0:l1, k0:k1, i0:i1],
                         t3[:, :, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("bmnjik,mnlacd->ijklabcd", W_vooooo[..., j0:j1, i0:i1, k0:k1],
+                    einsum("kijnmb,mnlacd->ijklabcd", W_ooooov[k0:k1, i0:i1, j0:j1],
                         t3[:, :, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("bmnjil,mnkadc->ijklabcd", W_vooooo[..., j0:j1, i0:i1, l0:l1],
+                    einsum("lijnmb,mnkadc->ijklabcd", W_ooooov[l0:l1, i0:i1, j0:j1],
                         t3[:, :, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("bmnjkl,mnicda->ijklabcd", W_vooooo[..., j0:j1, k0:k1, l0:l1],
+                    einsum("lkjnmb,mnicda->ijklabcd", W_ooooov[l0:l1, k0:k1, j0:j1],
                         t3[:, :, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("cmnkij,mnlabd->ijklabcd", W_vooooo[..., k0:k1, i0:i1, j0:j1],
+                    einsum("jiknmc,mnlabd->ijklabcd", W_ooooov[j0:j1, i0:i1, k0:k1],
                         t3[:, :, l0:l1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("cmnkil,mnjadb->ijklabcd", W_vooooo[..., k0:k1, i0:i1, l0:l1],
+                    einsum("liknmc,mnjadb->ijklabcd", W_ooooov[l0:l1, i0:i1, k0:k1],
                         t3[:, :, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("cmnkjl,mnibda->ijklabcd", W_vooooo[..., k0:k1, j0:j1, l0:l1],
+                    einsum("ljknmc,mnibda->ijklabcd", W_ooooov[l0:l1, j0:j1, k0:k1],
                         t3[:, :, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("dmnlij,mnkabc->ijklabcd", W_vooooo[..., l0:l1, i0:i1, j0:j1],
+                    einsum("jilnmd,mnkabc->ijklabcd", W_ooooov[j0:j1, i0:i1, l0:l1],
                         t3[:, :, k0:k1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("dmnlik,mnjacb->ijklabcd", W_vooooo[..., l0:l1, i0:i1, k0:k1],
+                    einsum("kilnmd,mnjacb->ijklabcd", W_ooooov[k0:k1, i0:i1, l0:l1],
                         t3[:, :, j0:j1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    einsum("dmnljk,mnibca->ijklabcd", W_vooooo[..., l0:l1, j0:j1, k0:k1],
+                    einsum("kjlnmd,mnibca->ijklabcd", W_ooooov[k0:k1, j0:j1, l0:l1],
                         t3[:, :, i0:i1], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
 
                     einsum("mlcd,abmijk->ijklabcd", t2[:, l0:l1], W_vvoooo[..., i0:i1, j0:j1, k0:k1],
@@ -546,57 +573,44 @@ def compute_r4_tri(mycc, imds, t2, t3, t4):
                     _unpack_t4_(mycc, t4, t4_tmp, i0, i1, j0, j1, k0, k1, l0, l1)
                     einsum("ae,ijklebcd->ijklabcd", F_vv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, j0, j1, i0, i1, k0, k1, l0, l1)
-                    einsum("be,jikleacd->ijklabcd", F_vv, t4_tmp[:bj, :bi, :bk, :bl],
+                    einsum("be,ijklaecd->ijklabcd", F_vv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, k0, k1, i0, i1, j0, j1, l0, l1)
-                    einsum("ce,kijleabd->ijklabcd", F_vv, t4_tmp[:bk, :bi, :bj, :bl],
+                    einsum("ce,ijklabed->ijklabcd", F_vv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, l0, l1, i0, i1, j0, j1, k0, k1)
-                    einsum("de,lijkeabc->ijklabcd", F_vv, t4_tmp[:bl, :bi, :bj, :bk],
+                    einsum("de,ijklabce->ijklabcd", F_vv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
 
-                    _unpack_t4_(mycc, t4, t4_tmp, i0, i1, j0, j1, k0, k1, l0, l1)
                     einsum("abef,ijklefcd->ijklabcd", W_vvvv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, i0, i1, k0, k1, j0, j1, l0, l1)
-                    einsum("acef,ikjlefbd->ijklabcd", W_vvvv, t4_tmp[:bi, :bk, :bj, :bl],
+                    einsum("acef,ijklebfd->ijklabcd", W_vvvv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, i0, i1, l0, l1, j0, j1, k0, k1)
-                    einsum("adef,iljkefbc->ijklabcd", W_vvvv, t4_tmp[:bi, :bl, :bj, :bk],
+                    einsum("adef,ijklebcf->ijklabcd", W_vvvv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, j0, j1, k0, k1, i0, i1, l0, l1)
-                    einsum("bcef,jkilefad->ijklabcd", W_vvvv, t4_tmp[:bj, :bk, :bi, :bl],
+                    einsum("bcef,ijklaefd->ijklabcd", W_vvvv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, j0, j1, l0, l1, i0, i1, k0, k1)
-                    einsum("bdef,jlikefac->ijklabcd", W_vvvv, t4_tmp[:bj, :bl, :bi, :bk],
+                    einsum("bdef,ijklaecf->ijklabcd", W_vvvv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                    _unpack_t4_(mycc, t4, t4_tmp, k0, k1, l0, l1, i0, i1, j0, j1)
-                    einsum("cdef,klijefab->ijklabcd", W_vvvv, t4_tmp[:bk, :bl, :bi, :bj],
+                    einsum("cdef,ijklabef->ijklabcd", W_vvvv, t4_tmp[:bi, :bj, :bk, :bl],
                         out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
 
-                    _accumulate_t4_(mycc, r4, r4_tmp, i0, i1, j0, j1, k0, k1, l0, l1)
-        time2 = log.timer_debug1('t4: iter: W_vvoooo * t2, W_vvvvoo * t2,\n'
-            '                           W_vvvo * t3, W_vooo * t3, W_ovvvoo * t3, W_ovvovo * t3, W_vooooo * t3,\n'
+                    _accumulate_t4_(mycc, r4, r4_tmp, i0, i1, j0, j1, k0, k1, l0, l1, beta=1.0)
+        time2 = log.timer_debug1('t4: iter: W_vvoooo * t2, W_vvvvoo * t2, W_ovovvo * t3, W_ooooov * t3,\n'
             '                           F_vv * t4, W_vvvv * t4 [%3d, %3d]:' % (l0, l1), *time2)
     t4_tmp = None
     r4_tmp = None
     c_t3 = None
     F_vv = imds.F_vv = None
-    W_vvvo = imds.W_vvvo = None
-    W_vooo = imds.W_vooo = None
     W_vvvv = imds.W_vvvv = None
-    W_ovvvoo = imds.W_ovvvoo = None
-    W_ovvovo = imds.W_ovvovo = None
-    W_vooooo = imds.W_vooooo = None
+    W_ovovvo = imds.W_ovovvo = None
+    W_ooooov = imds.W_ooooov = None
     W_vvoooo = imds.W_vvoooo = None
     W_vvvvoo = imds.W_vvvvoo = None
 
-    time1 = log.timer_debug1('t4: W_vvoooo * t2, W_vvvvoo * t2, W_vvvo * t3, W_vooo * t3, W_ovvvoo * t3,\n'
-                        '                     W_ovvovo * t3, W_vooooo * t3, F_vv * t4, W_vvvv * t4', *time1)
+    time1 = log.timer_debug1('t4: W_vvoooo * t2, W_vvvvoo * t2, W_ovovvo * t3, W_ooooov * t3, F_vv * t4, W_vvvv * t4',
+                             *time1)
 
     time2 = logger.process_clock(), logger.perf_counter()
-    t4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
+    t4_tmp = np.empty((nocc,) + (blksize,) * 3 + (nvir,) * 4, dtype=t4.dtype)
     r4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
     for l0, l1 in lib.prange(0, nocc, blksize):
         bl = l1 - l0
@@ -607,95 +621,74 @@ def compute_r4_tri(mycc, imds, t2, t3, t4):
                 for i0, i1 in lib.prange(0, j1, blksize):
                     bi = i1 - i0
 
-                    r4_tmp[:] = 0.0
-                    for m0, m1 in lib.prange(0, nocc, blksize):
-                        bm = m1 - m0
+                    _unpack_t4_(mycc, t4, t4_tmp, 0, nocc, j0, j1, k0, k1, l0, l1, nocc, blksize, blksize, blksize)
+                    einsum("mi,mjklabcd->ijklabcd", F_oo[:, i0:i1], t4_tmp[:, :bj, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=0.0)
+                    einsum("mbie,mjklaecd->ijklabcd", W_ovov[:, :, i0:i1, :], t4_tmp[:, :bj, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mcie,mjklabed->ijklabcd", W_ovov[:, :, i0:i1, :], t4_tmp[:, :bj, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mdie,mjklabce->ijklabcd", W_ovov[:, :, i0:i1, :], t4_tmp[:, :bj, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    t4_spin_summation_inplace_(t4_tmp, nocc * blksize**3, nvir, "P4_201", 1.0, 0.0)
+                    einsum("amie,mjklebcd->ijklabcd", W_voov[:, :, i0:i1, :], t4_tmp[:, :bj, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
 
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, j0, j1, k0, k1, l0, l1)
-                        einsum("mi,mjklabcd->ijklabcd", F_oo[m0:m1, i0:i1], t4_tmp[:bm, :bj, :bk, :bl],
-                            out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        t4_spin_summation_inplace_(t4_tmp, blksize**4, nvir, "P4_201", 1.0, 0.0)
-                        einsum("maei,mjklebcd->ijklabcd", W_ovvo[m0:m1, :, :, i0:i1],
-                            t4_tmp[:bm, :bj, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, i0, i1, k0, k1, l0, l1)
-                        einsum("mj,miklbacd->ijklabcd", F_oo[m0:m1, j0:j1], t4_tmp[:bm, :bi, :bk, :bl],
-                            out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        t4_spin_summation_inplace_(t4_tmp, blksize**4, nvir, "P4_201", 1.0, 0.0)
-                        einsum("mbej,mikleacd->ijklabcd", W_ovvo[m0:m1, :, :, j0:j1],
-                            t4_tmp[:bm, :bi, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, i0, i1, j0, j1, l0, l1)
-                        einsum("mk,mijlcabd->ijklabcd", F_oo[m0:m1, k0:k1], t4_tmp[:bm, :bi, :bj, :bl],
-                            out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        t4_spin_summation_inplace_(t4_tmp, blksize**4, nvir, "P4_201", 1.0, 0.0)
-                        einsum("mcek,mijleabd->ijklabcd", W_ovvo[m0:m1, :, :, k0:k1],
-                            t4_tmp[:bm, :bi, :bj, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, i0, i1, j0, j1, k0, k1)
-                        einsum("ml,mijkdabc->ijklabcd", F_oo[m0:m1, l0:l1], t4_tmp[:bm, :bi, :bj, :bk],
-                            out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        t4_spin_summation_inplace_(t4_tmp, blksize**4, nvir, "P4_201", 1.0, 0.0)
-                        einsum("mdel,mijkeabc->ijklabcd", W_ovvo[m0:m1, :, :, l0:l1],
-                            t4_tmp[:bm, :bi, :bj, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
-
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, j0, j1, k0, k1, l0, l1)
-                        einsum("maie,mjklbecd->ijklabcd", W_ovov[m0:m1, :, i0:i1, :],
-                            t4_tmp[:bm, :bj, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mbie,mjklaecd->ijklabcd", W_ovov[m0:m1, :, i0:i1, :],
-                            t4_tmp[:bm, :bj, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, k0, k1, j0, j1, l0, l1)
-                        einsum("maie,mkjlcebd->ijklabcd", W_ovov[m0:m1, :, i0:i1, :],
-                            t4_tmp[:bm, :bk, :bj, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mcie,mkjlaebd->ijklabcd", W_ovov[m0:m1, :, i0:i1, :],
-                            t4_tmp[:bm, :bk, :bj, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, l0, l1, j0, j1, k0, k1)
-                        einsum("maie,mljkdebc->ijklabcd", W_ovov[m0:m1, :, i0:i1, :],
-                            t4_tmp[:bm, :bl, :bj, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mdie,mljkaebc->ijklabcd", W_ovov[m0:m1, :, i0:i1, :],
-                            t4_tmp[:bm, :bl, :bj, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, i0, i1, k0, k1, l0, l1)
-                        einsum("mbje,miklaecd->ijklabcd", W_ovov[m0:m1, :, j0:j1, :],
-                            t4_tmp[:bm, :bi, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("maje,miklbecd->ijklabcd", W_ovov[m0:m1, :, j0:j1, :],
-                            t4_tmp[:bm, :bi, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, i0, i1, j0, j1, l0, l1)
-                        einsum("mcke,mijlaebd->ijklabcd", W_ovov[m0:m1, :, k0:k1, :],
-                            t4_tmp[:bm, :bi, :bj, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("make,mijlcebd->ijklabcd", W_ovov[m0:m1, :, k0:k1, :],
-                            t4_tmp[:bm, :bi, :bj, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, i0, i1, j0, j1, k0, k1)
-                        einsum("mdle,mijkaebc->ijklabcd", W_ovov[m0:m1, :, l0:l1, :],
-                            t4_tmp[:bm, :bi, :bj, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("male,mijkdebc->ijklabcd", W_ovov[m0:m1, :, l0:l1, :],
-                            t4_tmp[:bm, :bi, :bj, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, k0, k1, i0, i1, l0, l1)
-                        einsum("mbje,mkilcead->ijklabcd", W_ovov[m0:m1, :, j0:j1, :],
-                            t4_tmp[:bm, :bk, :bi, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mcje,mkilbead->ijklabcd", W_ovov[m0:m1, :, j0:j1, :],
-                            t4_tmp[:bm, :bk, :bi, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, l0, l1, i0, i1, k0, k1)
-                        einsum("mbje,mlikdeac->ijklabcd", W_ovov[m0:m1, :, j0:j1, :],
-                            t4_tmp[:bm, :bl, :bi, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mdje,mlikbeac->ijklabcd", W_ovov[m0:m1, :, j0:j1, :],
-                            t4_tmp[:bm, :bl, :bi, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, j0, j1, i0, i1, l0, l1)
-                        einsum("mcke,mjilbead->ijklabcd", W_ovov[m0:m1, :, k0:k1, :],
-                            t4_tmp[:bm, :bj, :bi, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mbke,mjilcead->ijklabcd", W_ovov[m0:m1, :, k0:k1, :],
-                            t4_tmp[:bm, :bj, :bi, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, j0, j1, i0, i1, k0, k1)
-                        einsum("mdle,mjikbeac->ijklabcd", W_ovov[m0:m1, :, l0:l1, :],
-                            t4_tmp[:bm, :bj, :bi, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mble,mjikdeac->ijklabcd", W_ovov[m0:m1, :, l0:l1, :],
-                            t4_tmp[:bm, :bj, :bi, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, l0, l1, i0, i1, j0, j1)
-                        einsum("mcke,mlijdeab->ijklabcd", W_ovov[m0:m1, :, k0:k1, :],
-                            t4_tmp[:bm, :bl, :bi, :bj], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mdke,mlijceab->ijklabcd", W_ovov[m0:m1, :, k0:k1, :],
-                            t4_tmp[:bm, :bl, :bi, :bj], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
-                        _unpack_t4_(mycc, t4, t4_tmp, m0, m1, k0, k1, i0, i1, j0, j1)
-                        einsum("mdle,mkijceab->ijklabcd", W_ovov[m0:m1, :, l0:l1, :],
-                            t4_tmp[:bm, :bk, :bi, :bj], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
-                        einsum("mcle,mkijdeab->ijklabcd", W_ovov[m0:m1, :, l0:l1, :],
-                            t4_tmp[:bm, :bk, :bi, :bj], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    _unpack_t4_(mycc, t4, t4_tmp, 0, nocc, i0, i1, k0, k1, l0, l1, nocc, blksize, blksize, blksize)
+                    einsum("mj,miklbacd->ijklabcd", F_oo[:, j0:j1], t4_tmp[:, :bi, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("maje,miklbecd->ijklabcd", W_ovov[:, :, j0:j1, :], t4_tmp[:, :bi, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mcje,miklbaed->ijklabcd", W_ovov[:, :, j0:j1, :], t4_tmp[:, :bi, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mdje,miklbace->ijklabcd", W_ovov[:, :, j0:j1, :], t4_tmp[:, :bi, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    t4_spin_summation_inplace_(t4_tmp, nocc * blksize**3, nvir, "P4_201", 1.0, 0.0)
+                    einsum("bmje,mikleacd->ijklabcd", W_voov[:, :, j0:j1, :], t4_tmp[:, :bi, :bk, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+
+                    _unpack_t4_(mycc, t4, t4_tmp, 0, nocc, i0, i1, j0, j1, l0, l1, nocc, blksize, blksize, blksize)
+                    einsum("mk,mijlcabd->ijklabcd", F_oo[:, k0:k1], t4_tmp[:, :bi, :bj, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("make,mijlcebd->ijklabcd", W_ovov[:, :, k0:k1, :], t4_tmp[:, :bi, :bj, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mbke,mijlcaed->ijklabcd", W_ovov[:, :, k0:k1, :], t4_tmp[:, :bi, :bj, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mdke,mijlcabe->ijklabcd", W_ovov[:, :, k0:k1, :], t4_tmp[:, :bi, :bj, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    t4_spin_summation_inplace_(t4_tmp, nocc * blksize**3, nvir, "P4_201", 1.0, 0.0)
+                    einsum("cmke,mijleabd->ijklabcd", W_voov[:, :, k0:k1, :], t4_tmp[:, :bi, :bj, :bl],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+
+                    _unpack_t4_(mycc, t4, t4_tmp, 0, nocc, i0, i1, j0, j1, k0, k1, nocc, blksize, blksize, blksize)
+                    einsum("ml,mijkdabc->ijklabcd", F_oo[:, l0:l1], t4_tmp[:, :bi, :bj, :bk],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("male,mijkdebc->ijklabcd", W_ovov[:, :, l0:l1, :], t4_tmp[:, :bi, :bj, :bk],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mble,mijkdaec->ijklabcd", W_ovov[:, :, l0:l1, :], t4_tmp[:, :bi, :bj, :bk],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    einsum("mcle,mijkdabe->ijklabcd", W_ovov[:, :, l0:l1, :], t4_tmp[:, :bi, :bj, :bk],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-1.0, beta=1.0)
+                    t4_spin_summation_inplace_(t4_tmp, nocc * blksize**3, nvir, "P4_201", 1.0, 0.0)
+                    einsum("dmle,mijkeabc->ijklabcd", W_voov[:, :, l0:l1, :], t4_tmp[:, :bi, :bj, :bk],
+                        out=r4_tmp[:bi, :bj, :bk, :bl], alpha=0.5, beta=1.0)
+
+                    _unpack_t4_triples_(mycc, t4, t4_tmp, 0, nocc, j0, j1, k0, k1, l0, l1,
+                                        nocc, blksize, blksize, blksize)
+                    einsum("maie,mjklbecd->ijklabcd", W_ovov[:, :, i0:i1, :],
+                        t4_tmp[:, :bj, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+                    _unpack_t4_triples_(mycc, t4, t4_tmp, 0, nocc, i0, i1, k0, k1, l0, l1,
+                                        nocc, blksize, blksize, blksize)
+                    einsum("mbje,miklaecd->ijklabcd", W_ovov[:, :, j0:j1, :],
+                        t4_tmp[:, :bi, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+                    _unpack_t4_triples_(mycc, t4, t4_tmp, 0, nocc, i0, i1, j0, j1, l0, l1,
+                                        nocc, blksize, blksize, blksize)
+                    einsum("mcke,mijlaebd->ijklabcd", W_ovov[:, :, k0:k1, :],
+                        t4_tmp[:, :bi, :bj, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
+                    _unpack_t4_triples_(mycc, t4, t4_tmp, 0, nocc, i0, i1, j0, j1, k0, k1,
+                                        nocc, blksize, blksize, blksize)
+                    einsum("mdle,mijkaebc->ijklabcd", W_ovov[:, :, l0:l1, :],
+                        t4_tmp[:, :bi, :bj, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=-0.5, beta=1.0)
 
                     _accumulate_t4_(mycc, r4, r4_tmp, i0, i1, j0, j1, k0, k1, l0, l1, beta=1.0)
         time2 = log.timer_debug1('t4: iter: F_oo * t4, W_ovvo * t4, W_ovov * t4 [%3d, %3d]:'%(l0, l1), *time2)
@@ -703,11 +696,11 @@ def compute_r4_tri(mycc, imds, t2, t3, t4):
     r4_tmp = None
     F_oo = imds.F_oo = None
     W_ovvo = imds.W_ovvo = None
-    W_ovov = imds.V_ovov = None
+    W_ovov = imds.W_ovov = None
     time1 = log.timer_debug1('t4: F_oo * t4, W_ovvo * t4, W_ovov * t4', *time1)
 
     time2 = logger.process_clock(), logger.perf_counter()
-    t4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
+    t4_tmp = np.empty((blksize,) * 3 + (nocc,) + (nvir,) * 4, dtype=t4.dtype)
     r4_tmp = np.empty((blksize,) * 4 + (nvir,) * 4, dtype=t4.dtype)
     for l0, l1 in lib.prange(0, nocc, blksize):
         bl = l1 - l0
@@ -717,33 +710,30 @@ def compute_r4_tri(mycc, imds, t2, t3, t4):
                 bj = j1 - j0
                 for i0, i1 in lib.prange(0, j1, blksize):
                     bi = i1 - i0
-
-                    r4_tmp[:] = 0.0
                     for m0, m1 in lib.prange(0, nocc, blksize):
                         bm = m1 - m0
-                        for n0, n1 in lib.prange(0, nocc, blksize):
-                            bn = n1 - n0
-
-                            _unpack_t4_(mycc, t4, t4_tmp, m0, m1, n0, n1, k0, k1, l0, l1)
-                            einsum("mnij,mnklabcd->ijklabcd", W_oooo[m0:m1, n0:n1, i0:i1, j0:j1],
-                                t4_tmp[:bm, :bn, :bk, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                            _unpack_t4_(mycc, t4, t4_tmp, m0, m1, n0, n1, j0, j1, l0, l1)
-                            einsum("mnik,mnjlacbd->ijklabcd", W_oooo[m0:m1, n0:n1, i0:i1, k0:k1],
-                                t4_tmp[:bm, :bn, :bj, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                            _unpack_t4_(mycc, t4, t4_tmp, m0, m1, n0, n1, j0, j1, k0, k1)
-                            einsum("mnil,mnjkadbc->ijklabcd", W_oooo[m0:m1, n0:n1, i0:i1, l0:l1],
-                                t4_tmp[:bm, :bn, :bj, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                            _unpack_t4_(mycc, t4, t4_tmp, m0, m1, n0, n1, i0, i1, l0, l1)
-                            einsum("mnjk,mnilbcad->ijklabcd", W_oooo[m0:m1, n0:n1, j0:j1, k0:k1],
-                                t4_tmp[:bm, :bn, :bi, :bl], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                            _unpack_t4_(mycc, t4, t4_tmp, m0, m1, n0, n1, i0, i1, k0, k1)
-                            einsum("mnjl,mnikbdac->ijklabcd", W_oooo[m0:m1, n0:n1, j0:j1, l0:l1],
-                                t4_tmp[:bm, :bn, :bi, :bk], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
-                            _unpack_t4_(mycc, t4, t4_tmp, m0, m1, n0, n1, i0, i1, j0, j1)
-                            einsum("mnkl,mnijcdab->ijklabcd", W_oooo[m0:m1, n0:n1, k0:k1, l0:l1],
-                                t4_tmp[:bm, :bn, :bi, :bj], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
 
-                    _accumulate_t4_(mycc, r4, r4_tmp, i0, i1, j0, j1, k0, k1, l0, l1, beta=1.0)
+                        _unpack_t4_(mycc, t4, t4_tmp, k0, k1, l0, l1,  m0, m1, 0, nocc, blksize, blksize, blksize, nocc)
+                        einsum("mnij,klmncdab->ijklabcd", W_oooo[m0:m1, :, i0:i1, j0:j1],
+                            t4_tmp[:bk, :bl, :bm], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=0.0)
+                        _unpack_t4_(mycc, t4, t4_tmp, j0, j1, l0, l1, m0, m1, 0, nocc, blksize, blksize, blksize, nocc)
+                        einsum("mnik,jlmnbdac->ijklabcd", W_oooo[m0:m1, :, i0:i1, k0:k1],
+                            t4_tmp[:bj, :bl, :bm], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
+                        _unpack_t4_(mycc, t4, t4_tmp, j0, j1, k0, k1, m0, m1, 0, nocc, blksize, blksize, blksize, nocc)
+                        einsum("mnil,jkmnbcad->ijklabcd", W_oooo[m0:m1, :, i0:i1, l0:l1],
+                            t4_tmp[:bj, :bk, :bm], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
+                        _unpack_t4_(mycc, t4, t4_tmp, i0, i1, l0, l1, m0, m1, 0, nocc, blksize, blksize, blksize, nocc)
+                        einsum("mnjk,ilmnadbc->ijklabcd", W_oooo[m0:m1, :, j0:j1, k0:k1],
+                            t4_tmp[:bi, :bl, :bm], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
+                        _unpack_t4_(mycc, t4, t4_tmp, i0, i1, k0, k1, m0, m1, 0, nocc, blksize, blksize, blksize, nocc)
+                        einsum("mnjl,ikmnacbd->ijklabcd", W_oooo[m0:m1, :, j0:j1, l0:l1],
+                            t4_tmp[:bi, :bk, :bm], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
+                        _unpack_t4_(mycc, t4, t4_tmp, i0, i1, j0, j1, m0, m1, 0, nocc, blksize, blksize, blksize, nocc)
+                        einsum("mnkl,ijmnabcd->ijklabcd", W_oooo[m0:m1, :, k0:k1, l0:l1],
+                            t4_tmp[:bi, :bj, :bm], out=r4_tmp[:bi, :bj, :bk, :bl], alpha=1.0, beta=1.0)
+
+                        _accumulate_t4_(mycc, r4, r4_tmp, i0, i1, j0, j1, k0, k1, l0, l1, beta=1.0)
+
         time2 = log.timer_debug1('t4: iter: W_oooo * t4 [%3d, %3d]:'%(l0, l1), *time2)
     t4_tmp = None
     r4_tmp = None
@@ -751,7 +741,8 @@ def compute_r4_tri(mycc, imds, t2, t3, t4):
     time1 = log.timer_debug1('t4: W_oooo * t4', *time1)
     return r4
 
-def r4_tri_divide_e_(mycc, r4, mo_energy):
+def r4_tri_divide_e_py_(mycc, r4, mo_energy):
+    # NOTE: For reference, not used in the actual code.
     nocc, nmo = mycc.nocc, mycc.nmo
     nvir = nmo - nocc
     blksize = mycc.blksize
@@ -804,15 +795,9 @@ def update_amps_rccsdtq_tri_(mycc, tamps, eris):
     # symmetrization
     r2 += r2.transpose(1, 0, 3, 2)
     time1 = log.timer_debug1('t1t2: symmetrize r2', *time1)
-    # divide by eijkabc
+    # divide by eijab
     r1r2_divide_e_(mycc, r1, r2, mo_energy)
     time1 = log.timer_debug1('t1t2: divide r1 & r2 by eia & eijab', *time1)
-
-    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2)]
-
-    t1 += r1
-    t2 += r2
-    time1 = log.timer_debug1('t1t2: update t1 & t2', *time1)
     time0 = log.timer_debug1('t1t2 total', *time0)
 
     # t3
@@ -830,12 +815,6 @@ def update_amps_rccsdtq_tri_(mycc, tamps, eris):
     # divide by eijkabc
     r3_divide_e_(mycc, r3, mo_energy)
     time1 = log.timer_debug1('t3: divide r3 by eijkabc', *time1)
-
-    res_norm.append(np.linalg.norm(r3))
-
-    t3 += r3
-    r3 = None
-    time1 = log.timer_debug1('t3: update t3', *time1)
     time0 = log.timer_debug1('t3 total', *time0)
 
     # t4
@@ -847,19 +826,22 @@ def update_amps_rccsdtq_tri_(mycc, tamps, eris):
     time1 = log.timer_debug1('t4: compute r4', *time1)
     # symmetrization
     symmetrize_tamps_tri_(r4, nocc)
-    t4_spin_summation_inplace_(r4, nocc4, nvir, "P4_full", -1.0 / 24.0, 1.0)
+    t4_project_1_minus_p4_p31_inplace_(r4, nocc4, nvir)
     purify_tamps_tri_(r4, nocc)
     time1 = log.timer_debug1('t4: symmetrize r4', *time1)
     # divide by eijkabc
     r4_tri_divide_e_(mycc, r4, mo_energy)
     time1 = log.timer_debug1('t4: divide r4 by eijklabcd', *time1)
 
-    res_norm.append(np.linalg.norm(r4))
+    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2), np.linalg.norm(r3), np.linalg.norm(r4)]
 
-    # t4 += r4
+    t1 += r1
+    t2 += r2
+    t3 += r3
+    # C implementation of t4 += r4
     t4_add_(t4, r4, nocc4, nvir)
-    r4 = None
-    time1 = log.timer_debug1('t4: update t4', *time1)
+    r1, r2, r3, r4 = None, None, None, None
+    time1 = log.timer_debug1('t4: update t1, t2, t3, t4', *time1)
     time0 = log.timer_debug1('t4 total', *time0)
     return res_norm
 
@@ -956,6 +938,7 @@ def dump_chk(mycc, tamps=None, frozen=None, mo_coeff=None, mo_occ=None):
         lib.chkfile.save(mycc.chkfile, 'rccsdtq', cc_chk)
     else:
         lib.chkfile.save(mycc.chkfile, 'rccsdtq_highm', cc_chk)
+    return mycc
 
 
 class RCCSDTQ(rccsdt.RCCSDT):
@@ -987,7 +970,7 @@ class RCCSDTQ(rccsdt.RCCSDT):
         T amplitudes t1[i,a], t2[i,j,a,b], t3[i,j,k,a,b,c]
     t4 :
         An array of shape (compressed_occ, nvir, nvir, nvir, nvir) for T4 amplitudes.
-        The occupied-oribtal dimension is stored in a compressed form for the
+        The occupied-orbital dimension is stored in a compressed form for the
         i <= j <= k <= l index combinations. The compressed tensor can be expanded to
         the full tensor by self.tamps_tri2full(t4)
     tamps :
@@ -1071,9 +1054,9 @@ def __init__(self):
         self.W_vooo = None
         self.W_vvvo = None
         self.W_vvvv = None
-        self.W_ovvvoo = None
-        self.W_ovvovo = None
-        self.W_vooooo = None
+        self.W_oovvvo = None
+        self.W_ovovvo = None
+        self.W_ooooov = None
         self.W_vvoooo = None
         self.W_vvvvoo = None
 
diff --git a/pyscf/cc/rccsdtq_highm.py b/pyscf/cc/rccsdtq_highm.py
index 0bcd01dde6..18aba0c11c 100644
--- a/pyscf/cc/rccsdtq_highm.py
+++ b/pyscf/cc/rccsdtq_highm.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,18 +27,16 @@
 '''
 
 import numpy as np
-import numpy
 import functools
 import ctypes
 from pyscf import lib
 from pyscf.lib import logger
-from pyscf.mp.mp2 import get_nocc, get_nmo, get_frozen_mask, get_e_hf, _mo_without_core
-from pyscf.cc import _ccsd, rccsdtq
+from pyscf.cc import rccsdtq
 from pyscf.cc.rccsdt import (_einsum, t3_spin_summation_inplace_, update_t1_fock_eris, intermediates_t1t2,
                             compute_r1r2, r1r2_divide_e_, intermediates_t3, _PhysicistsERIs)
 from pyscf.cc.rccsdt_highm import (t3_spin_summation, t3_perm_symmetrize_inplace_, purify_tamps_, r1r2_add_t3_,
                                     intermediates_t3_add_t3, compute_r3, r3_divide_e_)
-from pyscf.cc.rccsdtq import t4_spin_summation_inplace_, t4_add_, _IMDS
+from pyscf.cc.rccsdtq import t4_project_1_minus_p4_p31_inplace_, t4_add_, _IMDS
 from pyscf import __config__
 
 
@@ -119,26 +117,26 @@ def intermediates_t4(mycc, imds, t2, t3, t4):
 
     einsum('me,mjab->abej', t1_fock[:nocc, nocc:], t2, out=W_vvvo, alpha=-1.0, beta=1.0)
 
-    W_ovvvoo = np.empty((nocc,) + (nvir,) * 3 + (nocc,) * 2, dtype=t2.dtype)
-    einsum('maef,jibf->mabeij', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovvvoo, alpha=2.0, beta=0.0)
-    einsum('mafe,jibf->mabeij', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovvvoo, alpha=-1.0, beta=1.0)
-    einsum('mnei,njab->mabeij', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_ovvvoo, alpha=-2.0, beta=1.0)
-    einsum('nmei,njab->mabeij', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_ovvvoo, alpha=1.0, beta=1.0)
+    W_oovvvo = np.empty((nocc,) * 2 + (nvir,) * 3 + (nocc,), dtype=t2.dtype)
+    einsum('maef,jibf->ijeabm', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_oovvvo, alpha=2.0, beta=0.0)
+    einsum('mafe,jibf->ijeabm', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_oovvvo, alpha=-1.0, beta=1.0)
+    einsum('mnei,njab->ijeabm', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_oovvvo, alpha=-2.0, beta=1.0)
+    einsum('nmei,njab->ijeabm', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_oovvvo, alpha=1.0, beta=1.0)
     c_t3 = np.empty_like(t3)
     t3_spin_summation(t3, c_t3, nocc**3, nvir, "P3_201", 1.0, 0.0)
-    einsum('nmfe,nijfab->mabeij', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_ovvvoo, alpha=0.5, beta=1.0)
-    einsum('mnfe,nijfab->mabeij', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_ovvvoo, alpha=-0.25, beta=1.0)
+    einsum('nmfe,nijfab->ijeabm', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_oovvvo, alpha=0.5, beta=1.0)
+    einsum('mnfe,nijfab->ijeabm', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t3, out=W_oovvvo, alpha=-0.25, beta=1.0)
     c_t3 = None
 
-    W_ovvovo = np.empty((nocc,) + (nvir,) * 2 + (nocc, nvir, nocc), dtype=t2.dtype)
-    einsum('mafe,jibf->mabiej', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovvovo, alpha=1.0, beta=0.0)
-    einsum('mnie,njab->mabiej', t1_eris[:nocc, :nocc, :nocc, nocc:], t2, out=W_ovvovo, alpha=-1.0, beta=1.0)
-    einsum('nmef,injfab->mabiej', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_ovvovo, alpha=-0.5, beta=1.0)
+    W_ovovvo = np.empty((nocc,) + (nvir,) + (nocc, nvir, nvir, nocc), dtype=t2.dtype)
+    einsum('mafe,jibf->iejabm', t1_eris[:nocc, nocc:, nocc:, nocc:], t2, out=W_ovovvo, alpha=1.0, beta=0.0)
+    einsum('mnie,njab->iejabm', t1_eris[:nocc, :nocc, :nocc, nocc:], t2, out=W_ovovvo, alpha=-1.0, beta=1.0)
+    einsum('nmef,injfab->iejabm', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_ovovvo, alpha=-0.5, beta=1.0)
 
-    W_vooooo = np.empty((nvir,) + (nocc,) * 5, dtype=t2.dtype)
-    einsum('mnek,ijae->amnijk', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_vooooo, alpha=1.0, beta=0.0)
-    einsum('mnef,ijkaef->amnijk', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_vooooo, alpha=0.5, beta=1.0)
-    W_vooooo += W_vooooo.transpose(0, 2, 1, 3, 5, 4)
+    W_ooooov = np.empty((nocc,) * 5 + (nvir,), dtype=t2.dtype)
+    einsum('mnek,ijae->kjinma', t1_eris[:nocc, :nocc, nocc:, :nocc], t2, out=W_ooooov, alpha=1.0, beta=0.0)
+    einsum('mnef,ijkaef->kjinma', t1_eris[:nocc, :nocc, nocc:, nocc:], t3, out=W_ooooov, alpha=0.5, beta=1.0)
+    W_ooooov += W_ooooov.transpose(1, 0, 2, 4, 3, 5)
 
     W_vvoooo = np.empty((nvir,) * 2 + (nocc,) * 4, dtype=t2.dtype)
     einsum('amef,ijkebf->abmijk', t1_eris[nocc:, :nocc, nocc:, nocc:], t3, out=W_vvoooo, alpha=1.0, beta=0.0)
@@ -161,10 +159,10 @@ def intermediates_t4(mycc, imds, t2, t3, t4):
     einsum('mnef,nmjkfabc->abcejk', t1_eris[:nocc, :nocc, nocc:, nocc:], c_t4, out=W_vvvvoo, alpha=-0.5, beta=1.0)
     c_t4 = None
 
-    W_ovvvoo += W_ovvvoo.transpose(0, 2, 1, 3, 5, 4)
+    W_oovvvo += W_oovvvo.transpose(1, 0, 2, 4, 3, 5)
     W_vvoooo += W_vvoooo.transpose(1, 0, 2, 4, 3, 5)
     W_vvvvoo += W_vvvvoo.transpose(0, 2, 1, 3, 5, 4)
-    imds.W_ovvvoo, imds.W_ovvovo, imds.W_vooooo = W_ovvvoo, W_ovvovo, W_vooooo
+    imds.W_oovvvo, imds.W_ovovvo, imds.W_ooooov = W_oovvvo, W_ovovvo, W_ooooov
     imds.W_vvoooo, imds.W_vvvvoo = W_vvoooo, W_vvvvoo
     return imds
 
@@ -183,7 +181,7 @@ def compute_r4(mycc, imds, t2, t3, t4):
     F_oo, F_vv = imds.F_oo, imds.F_vv
     W_oooo, W_ovvo, W_ovov = imds.W_oooo, imds.W_ovvo, imds.W_ovov
     W_vvvo, W_vooo, W_vvvv = imds.W_vvvo, imds.W_vooo, imds.W_vvvv
-    W_ovvvoo, W_ovvovo, W_vooooo = imds.W_ovvvoo, imds.W_ovvovo, imds.W_vooooo
+    W_oovvvo, W_ovovvo, W_ooooov = imds.W_oovvvo, imds.W_ovovvo, imds.W_ooooov
     W_vvoooo, W_vvvvoo = imds.W_vvoooo, imds.W_vvvvoo
 
     r4 = np.empty_like(t4)
@@ -225,19 +223,19 @@ def compute_r4(mycc, imds, t2, t3, t4):
 
     c_t3 = np.empty_like(t3)
     t3_spin_summation(t3, c_t3, nocc**3, nvir, "P3_201", 1.0, 0.0)
-    einsum('mabeij,mklecd->ijklabcd', W_ovvvoo, c_t3, out=r4, alpha=0.125, beta=1.0)
-    W_ovvvoo = imds.W_ovvvoo = None
+    einsum('ijeabm,mklecd->ijklabcd', W_oovvvo, c_t3, out=r4, alpha=0.125, beta=1.0)
+    W_oovvvo = imds.W_oovvvo = None
     c_t3 = None
-    time1 = log.timer_debug1('t4: W_ovvvoo * c_t3', *time1)
+    time1 = log.timer_debug1('t4: W_oovvvo * c_t3', *time1)
 
-    einsum('mabiej,kmlecd->ijklabcd', W_ovvovo, t3, out=r4, alpha=-0.5, beta=1.0)
-    einsum('mcbiej,kmlead->ijklabcd', W_ovvovo, t3, out=r4, alpha=-1.0, beta=1.0)
-    W_ovvovo = imds.W_ovvovo = None
-    time1 = log.timer_debug1('t4: W_ovvovo * t3', *time1)
+    einsum('iejabm,kmlecd->ijklabcd', W_ovovvo, t3, out=r4, alpha=-0.5, beta=1.0)
+    einsum('iejcbm,kmlead->ijklabcd', W_ovovvo, t3, out=r4, alpha=-1.0, beta=1.0)
+    W_ovovvo = imds.W_ovovvo = None
+    time1 = log.timer_debug1('t4: W_ovovvo * t3', *time1)
 
-    einsum('amnijk,mnlbcd->ijklabcd', W_vooooo, t3, out=r4, alpha=0.5, beta=1.0)
-    W_vooooo = imds.W_vooooo = None
-    time1 = log.timer_debug1('t4: W_vooooo * t3', *time1)
+    einsum('kjinma,mnlbcd->ijklabcd', W_ooooov, t3, out=r4, alpha=0.5, beta=1.0)
+    W_ooooov = imds.W_ooooov = None
+    time1 = log.timer_debug1('t4: W_ooooov * t3', *time1)
 
     einsum('abmijk,mlcd->ijklabcd', W_vvoooo, t2, out=r4, alpha=-0.5, beta=1.0)
     W_vvoooo = imds.W_vvoooo = None
@@ -266,7 +264,7 @@ def update_amps_rccsdtq_(mycc, tamps, eris):
     t1, t2, t3, t4 = tamps
     mo_energy = eris.mo_energy
 
-    imds = _IMDS
+    imds = _IMDS()
 
     # t1, t2
     update_t1_fock_eris(mycc, imds, t1, eris)
@@ -280,15 +278,9 @@ def update_amps_rccsdtq_(mycc, tamps, eris):
     # symmetrization
     r2 += r2.transpose(1, 0, 3, 2)
     time1 = log.timer_debug1('t1t2: symmetrize r2', *time1)
-    # divide by eijkabc
+    # divide by eijab
     r1r2_divide_e_(mycc, r1, r2, mo_energy)
     time1 = log.timer_debug1('t1t2: divide r1 & r2 by eia & eijab', *time1)
-
-    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2)]
-
-    t1 += r1
-    t2 += r2
-    time1 = log.timer_debug1('t1t2: update t1 & t2', *time1)
     time0 = log.timer_debug1('t1t2 total', *time0)
 
     # t3
@@ -306,12 +298,6 @@ def update_amps_rccsdtq_(mycc, tamps, eris):
     # divide by eijkabc
     r3_divide_e_(mycc, r3, mo_energy)
     time1 = log.timer_debug1('t3: divide r3 by eijkabc', *time1)
-
-    res_norm.append(np.linalg.norm(r3))
-
-    t3 += r3
-    r3 = None
-    time1 = log.timer_debug1('t3: update t3', *time1)
     time0 = log.timer_debug1('t3 total', *time0)
 
     # t4
@@ -323,18 +309,22 @@ def update_amps_rccsdtq_(mycc, tamps, eris):
     time1 = log.timer_debug1('t4: compute r4', *time1)
     # symmetrization
     t4_perm_symmetrize_inplace_(r4, nocc, nvir, 1.0, 0.0)
-    t4_spin_summation_inplace_(r4, nocc**4, nvir, "P4_full", -1.0 / 24.0, 1.0)
+    t4_project_1_minus_p4_p31_inplace_(r4, nocc**4, nvir)
     purify_tamps_(r4)
     time1 = log.timer_debug1('t4: symmetrize r4', *time1)
     # divide by eijkabc
     r4_divide_e_(mycc, r4, mo_energy)
     time1 = log.timer_debug1('t4: divide r4 by eijklabcd', *time1)
 
-    res_norm.append(np.linalg.norm(r4))
+    res_norm = [np.linalg.norm(r1), np.linalg.norm(r2), np.linalg.norm(r3), np.linalg.norm(r4)]
 
+    t1 += r1
+    t2 += r2
+    t3 += r3
+    # C implementation of t4 += r4
     t4_add_(t4, r4, nocc**4, nvir)
-    r4 = None
-    time1 = log.timer_debug1('t4: update t4', *time1)
+    r1, r2, r3, r4 = None, None, None, None
+    time1 = log.timer_debug1('t4: update t1, t2, t3, t4', *time1)
     time0 = log.timer_debug1('t4 total', *time0)
     return res_norm
 
diff --git a/pyscf/cc/test/test_ccsd_lambda.py b/pyscf/cc/test/test_ccsd_lambda.py
index 3bbc7d0ebd..831185325c 100644
--- a/pyscf/cc/test/test_ccsd_lambda.py
+++ b/pyscf/cc/test/test_ccsd_lambda.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 import unittest
 import numpy
 from functools import reduce
@@ -112,7 +111,7 @@ def test_ccsd(self):
         self.assertAlmostEqual(numpy.dot(numpy.sin(l2new.flatten()), numpy.arange(35**2)), 507.656936701192, 8)
 
     def test_restart(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = mycc.copy()
         cc1.max_cycle = 5
         cc1.solve_lambda()
diff --git a/pyscf/cc/test/test_gccsd.py b/pyscf/cc/test/test_gccsd.py
index ed248645d1..bc021fd8e2 100644
--- a/pyscf/cc/test/test_gccsd.py
+++ b/pyscf/cc/test/test_gccsd.py
@@ -505,6 +505,36 @@ def test_mbpt2(self):
         emp2 = mp.MP2(mf).kernel()[0]
         self.assertAlmostEqual(e, emp2, 9)
 
+    def test_complex_orbitals(self):
+        mol = gto.M(atom='''
+        O    0.   0.       0.
+        H    0.   -0.757   0.587
+        H    0.   0.757    0.587''',
+        basis='6-31g*')
+        mf = mol.RHF().run()
+        cc = mf.CCSD().run()
+        nr_ref = cc.ecc
+
+        mf = mol.GHF()
+        dm = mf.get_init_guess() + 0j
+        nao = mol.nao
+        # Mixing alpha and beta spins
+        dm[nao:,:nao] = .02j
+        dm[:nao,nao:] = -.02j
+        mf.kernel(dm0=dm)
+        # test eris_incore
+        cc = mf.CCSD().run()
+        self.assertAlmostEqual(cc.ecc, nr_ref, 6)
+
+        # test eris_outcore
+        mf._eri = None
+        cc = mf.CCSD().run()
+        self.assertAlmostEqual(cc.ecc, nr_ref, 6)
+
+        # With SOC, correlation energy is slightly different from NR ref value
+        mf = mf.x2c().run()
+        cc = mf.CCSD().run()
+        self.assertAlmostEqual(cc.ecc, -0.19527045, 6)
 
 if __name__ == "__main__":
     print("Tests for GCCSD")
diff --git a/pyscf/cc/test/test_rccsd.py b/pyscf/cc/test/test_rccsd.py
index ff77224dae..d53cfb2217 100644
--- a/pyscf/cc/test/test_rccsd.py
+++ b/pyscf/cc/test/test_rccsd.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 from functools import reduce
 import unittest
 import copy
@@ -43,8 +42,8 @@ def setUpModule():
     mol.basis = '631g'
     mol.build()
     mf = scf.RHF(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.conv_tol_grad = 1e-8
+    mf.chkfile = lib.NamedTemporaryFile().name
     mf.kernel()
 
     mycc = rccsd.RCCSD(mf)
@@ -152,7 +151,7 @@ def test_no_diis(self):
         self.assertAlmostEqual(cc1.e_corr, -0.13516622806104395, 7)
 
     def test_restart(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.CCSD(mf)
         cc1.max_cycle = 5
         cc1.kernel()
diff --git a/pyscf/cc/test/test_rccsd_lambda.py b/pyscf/cc/test/test_rccsd_lambda.py
index ae46eb507e..12945144f3 100644
--- a/pyscf/cc/test/test_rccsd_lambda.py
+++ b/pyscf/cc/test/test_rccsd_lambda.py
@@ -262,7 +262,7 @@ def test_rdm_trace(self):
             +numpy.einsum('pkkq->pq', eri0[:nocc,:nocc,:nocc,:nocc]).trace())
         self.assertAlmostEqual(e2, -794721.197459942, 8)
         self.assertAlmostEqual(numpy.einsum('pqrs,pqrs', dm2, eri0)*.5 +
-                               numpy.einsum('pq,qp', dm1, h1), e2, 9)
+                               numpy.einsum('pq,qp', dm1, h1), e2, 8)
 
         self.assertAlmostEqual(abs(dm2-dm2.transpose(1,0,3,2)).max(), 0, 9)
         self.assertAlmostEqual(abs(dm2-dm2.transpose(2,3,0,1)).max(), 0, 9)
diff --git a/pyscf/cc/test/test_rccsdt.py b/pyscf/cc/test/test_rccsdt.py
index 90fc4ef609..83e5f53f10 100644
--- a/pyscf/cc/test/test_rccsdt.py
+++ b/pyscf/cc/test/test_rccsdt.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 from functools import reduce
 import unittest
 import copy
@@ -41,8 +40,8 @@ def setUpModule():
     mol.basis = '631g'
     mol.build()
     mf = scf.RHF(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.conv_tol_grad = 1e-8
+    mf.chkfile = lib.NamedTemporaryFile().name
     mf.kernel()
 
     mycc = rccsdt.RCCSDT(mf)
@@ -135,10 +134,10 @@ def test_no_diis(self):
         cc1.diis = False
         cc1.max_cycle = 4
         cc1.kernel()
-        self.assertAlmostEqual(cc1.e_corr, -0.1362172678103062, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13620561873465928, 7)
 
     def test_restart(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.RCCSDT(mf)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -149,7 +148,7 @@ def test_restart(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.13618790413398396, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13601543222004697, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0] - cc1.t1).max(), 0, 9)
@@ -160,7 +159,7 @@ def test_restart(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.13636637468987364, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13632994594327189, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
diff --git a/pyscf/cc/test/test_rccsdt_highm.py b/pyscf/cc/test/test_rccsdt_highm.py
index 6232b636e1..aad955ea84 100644
--- a/pyscf/cc/test/test_rccsdt_highm.py
+++ b/pyscf/cc/test/test_rccsdt_highm.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 from functools import reduce
 import unittest
 import copy
@@ -42,8 +41,8 @@ def setUpModule():
     mol.basis = '631g'
     mol.build()
     mf = scf.RHF(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.conv_tol_grad = 1e-8
+    mf.chkfile = lib.NamedTemporaryFile().name
     mf.kernel()
 
     mycc = rccsdt_highm.RCCSDT(mf)
@@ -92,10 +91,10 @@ def test_no_diis(self):
         cc1.diis = False
         cc1.max_cycle = 4
         cc1.kernel()
-        self.assertAlmostEqual(cc1.e_corr, -0.1362172678103062, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13620561873465487, 7)
 
     def test_restart(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.RCCSDT(mf, compact_tamps=False)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -106,7 +105,7 @@ def test_restart(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.13618790413398396, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13601543222004753, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0] - cc1.t1).max(), 0, 9)
@@ -117,7 +116,7 @@ def test_restart(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.13636637468987364, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.1363299459432733, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
diff --git a/pyscf/cc/test/test_rccsdt_q.py b/pyscf/cc/test/test_rccsdt_q.py
new file mode 100644
index 0000000000..9dd4427f3d
--- /dev/null
+++ b/pyscf/cc/test/test_rccsdt_q.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+from functools import reduce
+
+from pyscf import gto, scf, lib, symm
+from pyscf import cc
+from pyscf import ao2mo
+from pyscf.cc import rccsdt_q
+
+
+def setUpModule():
+    global mol, rhf, mcc, mcc2
+    mol = gto.Mole()
+    mol.atom = [
+        [8 , (0. , 0.     , 0.)],
+        [1 , (0. , -.757 , .487)],
+        [1 , (0. ,  .757 , .687)]]
+    mol.symmetry = True
+    mol.verbose = 7
+    mol.output = '/dev/null'
+    mol.basis = 'ccpvdz'
+    mol.build()
+    rhf = scf.RHF(mol)
+    rhf.conv_tol = 1e-14
+    rhf.scf()
+
+    mcc = cc.CCSDT(rhf, compact_tamps=True)
+    mcc.conv_tol = 1e-10
+    mcc.blksize = 2
+    mcc.blksize_oooo = 2
+    mcc.blksize_oovv = 2
+    mcc.ccsdt()
+
+    mcc2 = cc.CCSDT(rhf, compact_tamps=False)
+    mcc2.conv_tol = 1e-10
+    mcc2.ccsdt()
+
+def tearDownModule():
+    global mol, rhf, mcc, mcc2
+    mol.stdout.close()
+    del mol, rhf, mcc, mcc2
+
+class KnownValues(unittest.TestCase):
+    def test_rccsdt_q(self):
+        e_q_bracket, e_q_paren  = mcc.ccsdt_q()
+        self.assertAlmostEqual(e_q_bracket, -0.00044374834015582527, 9)
+        self.assertAlmostEqual(e_q_paren, -0.0004917163848923114, 9)
+        e_q_bracket2, e_q_paren2 = mcc2.ccsdt_q()
+        self.assertAlmostEqual(e_q_bracket2, -0.00044374834015582527, 9)
+        self.assertAlmostEqual(e_q_paren2, -0.0004917163848923114, 9)
+
+    def test_random(self):
+        mol = gto.M()
+        numpy.random.seed(42)
+        nocc, nvir = 5, 9
+        nmo = nocc + nvir
+
+        eris = cc.rccsdt._PhysicistsERIs()
+        eri1 = numpy.random.random((nmo, nmo, nmo, nmo)) - .5
+        eri1 = eri1 + eri1.transpose(2, 1, 0, 3)
+        eri1 = eri1 + eri1.transpose(0, 3, 2, 1)
+        eri1 = eri1 + eri1.transpose(1, 0, 3, 2)
+        eri1 *= .1
+        eris.pppp = eri1
+        f = numpy.random.random((nmo, nmo)) * .1
+        eris.fock = f + f.T + numpy.diag(numpy.arange(nmo))
+        eris.mo_energy = eris.fock.diagonal()
+
+        t1 = numpy.random.random((nocc, nvir)) * .1
+        t2 = numpy.random.random((nocc, nocc, nvir, nvir)) * .1
+        t2 = t2 + t2.transpose(1, 0, 3, 2)
+        t3_full = numpy.random.random((nocc, nocc, nocc, nvir, nvir, nvir)) * .1
+        t3_full = t3_full + t3_full.transpose(1, 0, 2, 4, 3, 5) + t3_full.transpose(2, 1, 0, 5, 4, 3)
+        t3_full = t3_full + t3_full.transpose(0, 2, 1, 3, 5, 4)
+        mf = scf.RHF(mol)
+        mycc = cc.CCSDT(mf, compact_tamps=False)
+        mycc.incore_complete = True
+        mycc.mo_energy = mycc._scf.mo_energy = numpy.arange(0., nocc + nvir)
+        e_q_bracket, e_q_paren = rccsdt_q.kernel(mycc, eris, (t1, t2, t3_full))
+        self.assertAlmostEqual(e_q_bracket, -1.1359579193293403, 9)
+        self.assertAlmostEqual(e_q_paren, -256.1325101409764, 9)
+
+        idx_i, idx_j, idx_k = numpy.meshgrid(numpy.arange(nocc), numpy.arange(nocc), numpy.arange(nocc), indexing='ij')
+        t3_tri = t3_full[(idx_i <= idx_j) & (idx_j <= idx_k)].reshape(-1, nvir, nvir, nvir)
+        mycc2 = cc.CCSDT(mf, compact_tamps=True)
+        mycc2.incore_complete = True
+        mycc2.mo_energy = mycc2._scf.mo_energy = numpy.arange(0., nocc + nvir)
+        mycc2.nocc, mycc2.nmo = nocc, nmo
+        e_q_bracket2, e_q_paren2 = rccsdt_q.kernel(mycc2, eris, (t1, t2, t3_tri))
+        self.assertAlmostEqual(e_q_bracket2, -1.1359579193293403, 9)
+        self.assertAlmostEqual(e_q_paren2, -256.1325101409764, 9)
+
+if __name__ == "__main__":
+    print("Full Tests for RCCSDT(Q)")
+    unittest.main()
diff --git a/pyscf/cc/test/test_rccsdtq.py b/pyscf/cc/test/test_rccsdtq.py
index 361a722dc2..e994241657 100644
--- a/pyscf/cc/test/test_rccsdtq.py
+++ b/pyscf/cc/test/test_rccsdtq.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 from functools import reduce
 import unittest
 import copy
@@ -42,8 +41,8 @@ def setUpModule():
     mol.basis = 'sto3g'
     mol.build()
     mf = scf.RHF(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.conv_tol_grad = 1e-8
+    mf.chkfile = lib.NamedTemporaryFile().name
     mf.kernel()
 
     mycc = rccsdtq.RCCSDTQ(mf)
@@ -120,10 +119,10 @@ def test_no_diis(self):
         cc1.diis = False
         cc1.max_cycle = 4
         cc1.kernel()
-        self.assertAlmostEqual(cc1.e_corr, -0.04931187059105583, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.049309044956853954, 7)
 
     def test_restart(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.RCCSDTQ(mf)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -134,7 +133,7 @@ def test_restart(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.04958018529884438, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.04957847496659795, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0] - cc1.t1).max(), 0, 9)
@@ -146,7 +145,7 @@ def test_restart(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.04956154962282544, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.04956142543268752, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
diff --git a/pyscf/cc/test/test_rccsdtq_highm.py b/pyscf/cc/test/test_rccsdtq_highm.py
index 5761f5c69c..a768e4548c 100644
--- a/pyscf/cc/test/test_rccsdtq_highm.py
+++ b/pyscf/cc/test/test_rccsdtq_highm.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 from functools import reduce
 import unittest
 import copy
@@ -42,8 +41,8 @@ def setUpModule():
     mol.basis = 'sto3g'
     mol.build()
     mf = scf.RHF(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.conv_tol_grad = 1e-8
+    mf.chkfile = lib.NamedTemporaryFile().name
     mf.kernel()
 
     mycc = rccsdtq_highm.RCCSDTQ(mf)
@@ -88,10 +87,10 @@ def test_no_diis(self):
         cc1.diis = False
         cc1.max_cycle = 4
         cc1.kernel()
-        self.assertAlmostEqual(cc1.e_corr, -0.04931187059105583, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.04930904495685323, 7)
 
     def test_restart(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.RCCSDTQ(mf, compact_tamps=False)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -102,7 +101,7 @@ def test_restart(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.04958018529884438, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.04957847496659781, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0] - cc1.t1).max(), 0, 9)
@@ -114,7 +113,7 @@ def test_restart(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.04956154962282544, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.04956142543268758, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
diff --git a/pyscf/cc/test/test_uccsdt.py b/pyscf/cc/test/test_uccsdt.py
index f68bc63fdc..eaeb66b699 100644
--- a/pyscf/cc/test/test_uccsdt.py
+++ b/pyscf/cc/test/test_uccsdt.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 import unittest
 import copy
 import numpy
@@ -185,7 +184,7 @@ def test_with_df_s2(self):
         self.assertAlmostEqual(mycc.e_tot, -75.83479685448731, 8)
 
     def test_restart_s0(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.UCCSDT(mf)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -196,7 +195,7 @@ def test_restart_s0(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.13617537767875998, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13598921953216506, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0][0] - cc1.t1[0]).max(), 0, 9)
@@ -213,7 +212,7 @@ def test_restart_s0(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.13636112399459543, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13631662652255083, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
@@ -233,7 +232,7 @@ def test_restart_s0(self):
         self.assertAlmostEqual(abs(cc1.t3[3] - cc2.t3[3]).max(), 0, 9)
 
     def test_restart_s2(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.UCCSDT(mf_s2)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -244,7 +243,7 @@ def test_restart_s2(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.10899528342067309, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10890900976962495, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0][0] - cc1.t1[0]).max(), 0, 9)
@@ -261,7 +260,7 @@ def test_restart_s2(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.10909663534556953, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10908025852894825, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
@@ -281,7 +280,7 @@ def test_restart_s2(self):
         self.assertAlmostEqual(abs(cc1.t3[3] - cc2.t3[3]).max(), 0, 9)
 
     def test_restart_s2_not_do_diis_max_t(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.UCCSDT(mf_s2)
         cc1.max_cycle = 5
         cc1.do_diis_max_t = False
@@ -293,7 +292,7 @@ def test_restart_s2_not_do_diis_max_t(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.10900065442286336, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10890253107679486, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         tamps.append(cc1.tamps[2])
@@ -307,7 +306,7 @@ def test_restart_s2_not_do_diis_max_t(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.10907414414270558, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10903201331931782, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
diff --git a/pyscf/cc/test/test_uccsdt_highm.py b/pyscf/cc/test/test_uccsdt_highm.py
index 8399638510..acb6c2bd80 100644
--- a/pyscf/cc/test/test_uccsdt_highm.py
+++ b/pyscf/cc/test/test_uccsdt_highm.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 import unittest
 import copy
 import numpy
@@ -85,7 +84,7 @@ def test_with_df_s2(self):
         self.assertAlmostEqual(mycc.e_tot, -75.83479685448731, 8)
 
     def test_restart_s0(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.UCCSDT(mf, compact_tamps=False)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -96,7 +95,7 @@ def test_restart_s0(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.13617537767875998, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.13598921953216658, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0][0] - cc1.t1[0]).max(), 0, 9)
@@ -121,7 +120,7 @@ def test_restart_s0(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.13636112399459543, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.1363166265225506, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
@@ -141,7 +140,7 @@ def test_restart_s0(self):
         self.assertAlmostEqual(abs(cc1.t3[3] - cc2.t3[3]).max(), 0, 9)
 
     def test_restart_s2(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.UCCSDT(mf_s2, compact_tamps=False)
         cc1.max_cycle = 5
         cc1.kernel()
@@ -152,7 +151,7 @@ def test_restart_s2(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.10899528342067309, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10890900976962473, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         self.assertAlmostEqual(abs(tamps[0][0] - cc1.t1[0]).max(), 0, 9)
@@ -169,7 +168,7 @@ def test_restart_s2(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.10909663534556953, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10908025852894809, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
@@ -189,7 +188,7 @@ def test_restart_s2(self):
         self.assertAlmostEqual(abs(cc1.t3[3] - cc2.t3[3]).max(), 0, 9)
 
     def test_restart_s2_not_do_diis_max_t(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         cc1 = cc.UCCSDT(mf_s2, compact_tamps=False)
         cc1.max_cycle = 5
         cc1.do_diis_max_t = False
@@ -201,7 +200,7 @@ def test_restart_s2_not_do_diis_max_t(self):
         cc1.diis = adiis
         cc1.max_cycle = 3
         cc1.kernel(tamps=None)
-        self.assertAlmostEqual(cc1.e_corr, -0.10900065442286336, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10890253107679505, 7)
 
         tamps = cc1.vector_to_amplitudes(adiis.extrapolate())
         tamps.append(cc1.tamps[2])
@@ -215,7 +214,7 @@ def test_restart_s2_not_do_diis_max_t(self):
         import copy
         tmp_tamps = copy.deepcopy(tamps)
         cc1.kernel(tmp_tamps)
-        self.assertAlmostEqual(cc1.e_corr, -0.10907414414270558, 7)
+        self.assertAlmostEqual(cc1.e_corr, -0.10903201331931785, 7)
 
         cc1.diis = adiis
         cc1.max_cycle = 2
diff --git a/pyscf/cc/uccsdt.py b/pyscf/cc/uccsdt.py
index 76c052b05b..0e24ffe6aa 100644
--- a/pyscf/cc/uccsdt.py
+++ b/pyscf/cc/uccsdt.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -450,7 +450,7 @@ def energy_uhf(mycc, tamps, eris=None):
     eos +=       lib.einsum('ia,JB,iJaB', t1a, t1b, eris.pPpP[:nocca, :noccb, nocca:, noccb:])
 
     if abs((ess + eos).imag) > 1e-4:
-        logger.warn(mycc, 'Non-zero imaginary part found in %s energy %s', mycc.__class__.name, ess + eos)
+        logger.warn(mycc, 'Non-zero imaginary part found in %s energy %s', mycc.__class__.__name__, ess + eos)
 
     mycc.e_corr = lib.tag_array((ess + eos).real, e_corr_ss=ess.real, e_corr_os=eos.real)
     return mycc.e_corr.real
@@ -553,12 +553,12 @@ def intermediates_t1t2_uhf(mycc, imds, t2):
     einsum('kldc,jdlc->kj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=F_oo, alpha=1.0, beta=1.0)
     W_oooo = t1_erisaa[:nocca, :nocca, :nocca, :nocca].copy()
     einsum('klcd,ijcd->klij', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=W_oooo, alpha=0.5, beta=1.0)
-    W_ovvo = t1_erisaa[:nocca, nocca:, nocca:, :nocca].copy()
-    einsum('klcd,jlbd->kbcj', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=W_ovvo, alpha=0.5, beta=1.0)
-    einsum('klcd,jbld->kbcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_ovvo, alpha=0.5, beta=1.0)
-    W_OvVo = t1_erisab[nocca:, :noccb, :nocca, noccb:].transpose(1, 0, 3, 2).copy()
-    einsum('klcd,jbld->kbcj', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2ab, out=W_OvVo, alpha=0.5, beta=1.0)
-    einsum('lkdc,jlbd->kbcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2aa, out=W_OvVo, alpha=0.5, beta=1.0)
+    W_voov = t1_erisaa[nocca:, :nocca, :nocca, nocca:].copy()
+    einsum('klcd,jlbd->bkjc', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=W_voov, alpha=0.5, beta=1.0)
+    einsum('klcd,jbld->bkjc', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_voov, alpha=0.5, beta=1.0)
+    W_vOoV = t1_erisab[nocca:, :noccb, :nocca, noccb:].copy()
+    einsum('klcd,jbld->bkjc', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2ab, out=W_vOoV, alpha=0.5, beta=1.0)
+    einsum('lkdc,jlbd->bkjc', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2aa, out=W_vOoV, alpha=0.5, beta=1.0)
 
     F_VV = t1_fockb[noccb:, noccb:].copy()
     einsum('klcd,klbd->bc', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=F_VV, alpha=-0.5, beta=1.0)
@@ -568,36 +568,31 @@ def intermediates_t1t2_uhf(mycc, imds, t2):
     einsum('lkcd,lcjd->kj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=F_OO, alpha=1.0, beta=1.0)
     W_OOOO = t1_erisbb[:noccb, :noccb, :noccb, :noccb].copy()
     einsum('klcd,ijcd->klij', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=W_OOOO, alpha=0.5, beta=1.0)
-    W_OVVO = t1_erisbb[:noccb, noccb:, noccb:, :noccb].copy()
-    einsum('klcd,jlbd->kbcj', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=W_OVVO, alpha=0.5, beta=1.0)
-    einsum('lkdc,ldjb->kbcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_OVVO, alpha=0.5, beta=1.0)
+    W_VOOV = t1_erisbb[noccb:, :noccb, :noccb, noccb:].copy()
+    einsum('klcd,jlbd->bkjc', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=W_VOOV, alpha=0.5, beta=1.0)
+    einsum('lkdc,ldjb->bkjc', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_VOOV, alpha=0.5, beta=1.0)
+
     W_oVvO = t1_erisab[:nocca, noccb:, nocca:, :noccb].copy()
     einsum('klcd,ldjb->kbcj', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2ab, out=W_oVvO, alpha=0.5, beta=1.0)
     einsum('klcd,jlbd->kbcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2bb, out=W_oVvO, alpha=0.5, beta=1.0)
-
     W_oOoO = t1_erisab[:nocca, :noccb, :nocca, :noccb].copy()
     einsum('klcd,icjd->klij', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_oOoO, alpha=1.0, beta=1.0)
-    W_vOvO = - t1_erisab[nocca:, :noccb, nocca:, :noccb]
-    einsum('lkcd,lajd->akcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_vOvO, alpha=0.5, beta=1.0)
-    W_VoVo = - t1_erisab[:nocca, noccb:, :nocca, noccb:].transpose(1, 0, 3, 2)
-    einsum('kldc,idlb->bkci', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_VoVo, alpha=0.5, beta=1.0)
-    W_vovo = - t1_erisaa[nocca:, :nocca, nocca:, :nocca]
-    einsum('klcd,lida->akci', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=W_vovo, alpha=0.5, beta=1.0)
-    einsum('klcd,iald->akci', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_vovo, alpha=0.5, beta=1.0)
-    W_VOVO = - t1_erisbb[noccb:, :noccb, noccb:, :noccb]
-    einsum('klcd,ljdb->bkcj', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=W_VOVO, alpha=0.5, beta=1.0)
-    einsum('lkdc,ldjb->bkcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_VOVO, alpha=0.5, beta=1.0)
-    W_vOVo = t1_erisab[nocca:, :noccb, :nocca, noccb:].transpose(0, 1, 3, 2).copy()
-    einsum('lkdc,ilad->akci', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2aa, out=W_vOVo, alpha=0.5, beta=1.0)
-    einsum('lkdc,iald->akci', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2ab, out=W_vOVo, alpha=0.5, beta=1.0)
-    W_VovO = t1_erisab[:nocca, noccb:, nocca:, :noccb].transpose(1, 0, 2, 3).copy()
-    einsum('klcd,ljdb->bkcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2bb, out=W_VovO, alpha=0.5, beta=1.0)
-    einsum('lkdc,ldjb->bkcj', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2ab, out=W_VovO, alpha=0.5, beta=1.0)
+    W_vOvO = t1_erisab[nocca:, :noccb, nocca:, :noccb].copy()
+    einsum('lkcd,lajd->akcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_vOvO, alpha=-0.5, beta=1.0)
+    W_oVoV = t1_erisab[:nocca, noccb:, :nocca, noccb:].copy()
+    einsum('kldc,idlb->kbic', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_oVoV, alpha=-0.5, beta=1.0)
+    W_vovo = t1_erisaa[nocca:, :nocca, nocca:, :nocca].copy()
+    einsum('klcd,lida->akci', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=W_vovo, alpha=-0.5, beta=1.0)
+    einsum('klcd,iald->akci', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_vovo, alpha=-0.5, beta=1.0)
+    W_VOVO = t1_erisbb[noccb:, :noccb, noccb:, :noccb].copy()
+    einsum('klcd,ljdb->bkcj', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=W_VOVO, alpha=-0.5, beta=1.0)
+    einsum('lkdc,ldjb->bkcj', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_VOVO, alpha=-0.5, beta=1.0)
+
     imds.F_oo, imds.F_OO, imds.F_vv, imds.F_VV = F_oo, F_OO, F_vv, F_VV
     imds.W_oooo, imds.W_oOoO, imds.W_OOOO = W_oooo, W_oOoO, W_OOOO
-    imds.W_ovvo, imds.W_oVvO, imds.W_OvVo, imds.W_OVVO = W_ovvo, W_oVvO, W_OvVo, W_OVVO,
-    imds.W_vovo, imds.W_vOvO, imds.W_vOVo = W_vovo, W_vOvO, W_vOVo
-    imds.W_VovO, imds.W_VoVo, imds.W_VOVO = W_VovO, W_VoVo, W_VOVO
+    imds.W_voov, imds.W_oVvO, imds.W_VOOV = W_voov, W_oVvO, W_VOOV
+    imds.W_vovo, imds.W_vOvO, imds.W_vOoV = W_vovo, W_vOvO, W_vOoV
+    imds.W_oVoV, imds.W_VOVO = W_oVoV, W_VOVO
     return imds
 
 def compute_r1r2_uhf(mycc, imds, t2):
@@ -613,9 +608,9 @@ def compute_r1r2_uhf(mycc, imds, t2):
 
     F_oo, F_OO, F_vv, F_VV = imds.F_oo, imds.F_OO, imds.F_vv, imds.F_VV
     W_oooo, W_oOoO, W_OOOO = imds.W_oooo, imds.W_oOoO, imds.W_OOOO
-    W_ovvo, W_oVvO, W_OvVo, W_OVVO = imds.W_ovvo, imds.W_oVvO, imds.W_OvVo, imds.W_OVVO
-    W_vovo, W_vOvO, W_vOVo = imds.W_vovo, imds.W_vOvO, imds.W_vOVo
-    W_VovO, W_VoVo, W_VOVO = imds.W_VovO, imds.W_VoVo, imds.W_VOVO
+    W_voov, W_oVvO, W_VOOV = imds.W_voov, imds.W_oVvO, imds.W_VOOV
+    W_vovo, W_vOvO, W_vOoV = imds.W_vovo, imds.W_vOvO, imds.W_vOoV
+    W_oVoV, W_VOVO = imds.W_oVoV, imds.W_VOVO
 
     r1a = t1_focka[nocca:, :nocca].T.copy()
     einsum('kc,ikac->ia', t1_focka[:nocca, nocca:], t2aa, out=r1a, alpha=1.0, beta=1.0)
@@ -638,10 +633,8 @@ def compute_r1r2_uhf(mycc, imds, t2):
     einsum("kj,ikab->ijab", F_oo, t2aa, out=r2aa, alpha=-0.5, beta=1.0)
     einsum("abcd,ijcd->ijab", t1_erisaa[nocca:, nocca:, nocca:, nocca:], t2aa, out=r2aa, alpha=0.125, beta=1.0)
     einsum("klij,klab->ijab", W_oooo, t2aa, out=r2aa, alpha=0.125, beta=1.0)
-    einsum("kbcj,ikac->ijab", W_ovvo, t2aa, out=r2aa, alpha=1.0, beta=1.0)
-    einsum("kbcj,iakc->ijab", W_OvVo, t2ab, out=r2aa, alpha=1.0, beta=1.0)
-    W_ovvo = imds.W_ovvo = None
-    W_OvVo = imds.W_OvVo = None
+    einsum("bkjc,ikac->ijab", W_voov, t2aa, out=r2aa, alpha=1.0, beta=1.0)
+    einsum("bkjc,iakc->ijab", W_vOoV, t2ab, out=r2aa, alpha=1.0, beta=1.0)
 
     r2ab = t1_erisab[nocca:, noccb:, :nocca, :noccb].transpose(2, 3, 0, 1).copy()
     r2ab = r2ab.transpose(0, 2, 1, 3)
@@ -651,17 +644,13 @@ def compute_r1r2_uhf(mycc, imds, t2):
     einsum("ki,kajb->iajb", F_oo, t2ab, out=r2ab, alpha=-1.0, beta=1.0)
     einsum("abcd,icjd->iajb", t1_erisab[nocca:, noccb:, nocca:, noccb:], t2ab, out=r2ab, alpha=1.0, beta=1.0)
     einsum("klij,kalb->iajb", W_oOoO, t2ab, out=r2ab, alpha=1.0, beta=1.0)
-    einsum("akcj,ickb->iajb", W_vOvO, t2ab, out=r2ab, alpha=1.0, beta=1.0)
-    einsum("akci,kcjb->iajb", W_vovo, t2ab, out=r2ab, alpha=1.0, beta=1.0)
-    einsum("akci,kjcb->iajb", W_vOVo, t2bb, out=r2ab, alpha=1.0, beta=1.0)
-    einsum("bkcj,ikac->iajb", W_VovO, t2aa, out=r2ab, alpha=1.0, beta=1.0)
-    einsum("bkcj,iakc->iajb", W_VOVO, t2ab, out=r2ab, alpha=1.0, beta=1.0)
-    einsum("bkci,kajc->iajb", W_VoVo, t2ab, out=r2ab, alpha=1.0, beta=1.0)
+    einsum("akcj,ickb->iajb", W_vOvO, t2ab, out=r2ab, alpha=-1.0, beta=1.0)
+    einsum("akci,kcjb->iajb", W_vovo, t2ab, out=r2ab, alpha=-1.0, beta=1.0)
+    einsum("akic,kjcb->iajb", W_vOoV, t2bb, out=r2ab, alpha=1.0, beta=1.0)
+    einsum("kbcj,ikac->iajb", W_oVvO, t2aa, out=r2ab, alpha=1.0, beta=1.0)
+    einsum("bkcj,iakc->iajb", W_VOVO, t2ab, out=r2ab, alpha=-1.0, beta=1.0)
+    einsum("kbic,kajc->iajb", W_oVoV, t2ab, out=r2ab, alpha=-1.0, beta=1.0)
     W_vovo = imds.W_vovo = None
-    W_vOvO = imds.W_vOvO = None
-    W_vOVo = imds.W_vOVo = None
-    W_VovO = imds.W_VovO = None
-    W_VoVo = imds.W_VoVo = None
     W_VOVO = imds.W_VOVO = None
 
     r2bb = 0.25 * t1_erisbb[noccb:, noccb:, :noccb, :noccb].T
@@ -669,10 +658,8 @@ def compute_r1r2_uhf(mycc, imds, t2):
     einsum("kj,ikab->ijab", F_OO, t2bb, out=r2bb, alpha=-0.5, beta=1.0)
     einsum("abcd,ijcd->ijab", t1_erisbb[noccb:, noccb:, noccb:, noccb:], t2bb, out=r2bb, alpha=0.125, beta=1.0)
     einsum("klij,klab->ijab", W_OOOO, t2bb, out=r2bb, alpha=0.125, beta=1.0)
-    einsum("kbcj,ikac->ijab", W_OVVO, t2bb, out=r2bb, alpha=1.0, beta=1.0)
+    einsum("bkjc,ikac->ijab", W_VOOV, t2bb, out=r2bb, alpha=1.0, beta=1.0)
     einsum("kbcj,kcia->ijab", W_oVvO, t2ab, out=r2bb, alpha=1.0, beta=1.0)
-    W_oVvO = imds.W_oVvO = None
-    W_OVVO = imds.W_OVVO = None
     return [r1a, r1b], [r2aa, r2ab, r2bb]
 
 def r1r2_add_t3_tri_uhf_(mycc, imds, r1, r2, t3):
@@ -855,12 +842,6 @@ def intermediates_t3_uhf(mycc, imds, t2):
 
     W_vvvv = t1_erisaa[nocca:, nocca:, nocca:, nocca:].copy()
     einsum('lmde,lmab->abde', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=W_vvvv, alpha=0.5, beta=1.0)
-    W_voov = t1_erisaa[nocca:, :nocca, :nocca, nocca:].copy()
-    einsum('mled,imae->alid', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=W_voov, alpha=1.0, beta=1.0)
-    einsum('lmde,iame->alid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_voov, alpha=1.0, beta=1.0)
-    W_vOoV = t1_erisab[nocca:, :noccb, :nocca, noccb:].copy()
-    einsum('mled,imae->alid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2aa, out=W_vOoV, alpha=1.0, beta=1.0)
-    einsum('mled,iame->alid', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2ab, out=W_vOoV, alpha=1.0, beta=1.0)
     W_vvvo = t1_erisaa[nocca:, nocca:, nocca:, :nocca].copy()
     einsum('lbed,klce->bcdk', t1_erisaa[:nocca, nocca:, nocca:, nocca:], t2aa, out=W_vvvo, alpha=2.0, beta=1.0)
     einsum('blde,kcle->bcdk', t1_erisab[nocca:, :noccb, nocca:, noccb:], t2ab, out=W_vvvo, alpha=2.0, beta=1.0)
@@ -870,15 +851,8 @@ def intermediates_t3_uhf(mycc, imds, t2):
     einsum('mldj,kmcd->lcjk', t1_erisaa[:nocca, :nocca, nocca:, :nocca], t2aa, out=W_ovoo, alpha=2.0, beta=1.0)
     einsum('lmjd,kcmd->lcjk', t1_erisab[:nocca, :noccb, :nocca, noccb:], t2ab, out=W_ovoo, alpha=2.0, beta=1.0)
     einsum('lcde,jkde->lcjk', t1_erisaa[:nocca, nocca:, nocca:, nocca:], t2aa, out=W_ovoo, alpha=0.5, beta=1.0)
-
     W_VVVV = t1_erisbb[noccb:, noccb:, noccb:, noccb:].copy()
     einsum('lmde,lmab->abde', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=W_VVVV, alpha=0.5, beta=1.0)
-    W_VOOV = t1_erisbb[noccb:, :noccb, :noccb, noccb:].copy()
-    einsum('mled,imae->alid', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=W_VOOV, alpha=1.0, beta=1.0)
-    einsum('mled,meia->alid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_VOOV, alpha=1.0, beta=1.0)
-    W_VoOv = t1_erisab[:nocca, noccb:, nocca:, :noccb].transpose(1, 0, 3, 2).copy()
-    einsum('lmde,imae->alid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2bb, out=W_VoOv, alpha=1.0, beta=1.0)
-    einsum('mled,meia->alid', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2ab, out=W_VoOv, alpha=1.0, beta=1.0)
     W_VVVO = t1_erisbb[noccb:, noccb:, noccb:, :noccb].copy()
     einsum('lbed,klce->bcdk', t1_erisbb[:noccb, noccb:, noccb:, noccb:], t2bb, out=W_VVVO, alpha=2.0, beta=1.0)
     einsum('lbed,lekc->bcdk', t1_erisab[:nocca, noccb:, nocca:, noccb:], t2ab, out=W_VVVO, alpha=2.0, beta=1.0)
@@ -888,13 +862,8 @@ def intermediates_t3_uhf(mycc, imds, t2):
     einsum('mldj,kmcd->lcjk', t1_erisbb[:noccb, :noccb, noccb:, :noccb], t2bb, out=W_OVOO, alpha=2.0, beta=1.0)
     einsum('mldj,mdkc->lcjk', t1_erisab[:nocca, :noccb, nocca:, :noccb], t2ab, out=W_OVOO, alpha=2.0, beta=1.0)
     einsum('lcde,jkde->lcjk', t1_erisbb[:noccb, noccb:, noccb:, noccb:], t2bb, out=W_OVOO, alpha=0.5, beta=1.0)
-
     W_vVvV = t1_erisab[nocca:, noccb:, nocca:, noccb:].copy()
     einsum('lmed,lbmc->bced', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_vVvV, alpha=1.0, beta=1.0)
-    W_oVoV = t1_erisab[:nocca, noccb:, :nocca, noccb:].copy()
-    einsum('lmed,iemc->lcid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_oVoV, alpha=-1.0, beta=1.0)
-    W_vOvO = t1_erisab[nocca:, :noccb, nocca:, :noccb].copy()
-    einsum('mlde,make->aldk', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=W_vOvO, alpha=-1.0, beta=1.0)
     W_vVvO = t1_erisab[nocca:, noccb:, nocca:, :noccb].copy()
     einsum('lbed,lekc->bcdk', t1_erisaa[:nocca, nocca:, nocca:, nocca:], t2ab, out=W_vVvO, alpha=1.0, beta=1.0)
     einsum('blde,lkec->bcdk', t1_erisab[nocca:, :noccb, nocca:, noccb:], t2bb, out=W_vVvO, alpha=1.0, beta=1.0)
@@ -919,10 +888,19 @@ def intermediates_t3_uhf(mycc, imds, t2):
     einsum('alde,jdke->aljk', t1_erisab[nocca:, :noccb, nocca:, noccb:], t2ab, out=W_vOoO, alpha=1.0, beta=1.0)
     imds.W_ovoo, imds.W_oVoO, imds.W_OVOO = W_ovoo, W_oVoO, W_OVOO
     imds.W_vOoO, imds.W_vVoV = W_vOoO, W_vVoV
-    imds.W_voov, imds.W_vOoV, imds.W_VoOv, imds.W_VOOV = W_voov, W_vOoV, W_VoOv, W_VOOV
-    imds.W_oVoV, imds.W_vOvO = W_oVoV, W_vOvO
     imds.W_vvvo, imds.W_vVvO, imds.W_VVVO = W_vvvo, W_vVvO, W_VVVO
     imds.W_vvvv, imds.W_vVvV, imds.W_VVVV = W_vvvv, W_vVvV, W_VVVV
+
+    einsum('mled,imae->alid', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2aa, out=imds.W_voov, alpha=0.5, beta=1.0)
+    einsum('lmde,iame->alid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=imds.W_voov, alpha=0.5, beta=1.0)
+    einsum('mled,imae->alid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2aa, out=imds.W_vOoV, alpha=0.5, beta=1.0)
+    einsum('mled,iame->alid', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2ab, out=imds.W_vOoV, alpha=0.5, beta=1.0)
+    einsum('mled,imae->alid', t1_erisbb[:noccb, :noccb, noccb:, noccb:], t2bb, out=imds.W_VOOV, alpha=0.5, beta=1.0)
+    einsum('mled,meia->alid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=imds.W_VOOV, alpha=0.5, beta=1.0)
+    einsum('lmde,imae->ladi', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2bb, out=imds.W_oVvO, alpha=0.5, beta=1.0)
+    einsum('mled,meia->ladi', t1_erisaa[:nocca, :nocca, nocca:, nocca:], t2ab, out=imds.W_oVvO, alpha=0.5, beta=1.0)
+    einsum('lmed,iemc->lcid', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=imds.W_oVoV, alpha=-0.5, beta=1.0)
+    einsum('mlde,make->aldk', t1_erisab[:nocca, :noccb, nocca:, noccb:], t2ab, out=imds.W_vOvO, alpha=-0.5, beta=1.0)
     return imds
 
 def intermediates_t3_add_t3_tri_uhf(mycc, imds, t3):
@@ -1289,7 +1267,7 @@ def compute_r3bbb_tri_uhf(mycc, imds, t2, t3):
 
     F_OO, F_VV = imds.F_OO, imds.F_VV
     W_OOOO, W_OVOO, W_VVVO, W_VVVV = imds.W_OOOO, imds.W_OVOO, imds.W_VVVO, imds.W_VVVV
-    W_VoOv, W_VOOV = imds.W_VoOv, imds.W_VOOV
+    W_oVvO, W_VOOV = imds.W_oVvO, imds.W_VOOV
 
     r3bbb = np.zeros_like(t3bbb)
 
@@ -1465,39 +1443,39 @@ def compute_r3bbb_tri_uhf(mycc, imds, t2, t3):
 
                             _unp_bba_(mycc, t3bba, t3_tmp_2, j0, j1, k0, k1, b0, b1, c0, c1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("alid,jkbcld->ijkabc", W_VoOv[a0:a1, :, i0:i1, :],
+                            einsum("ladi,jkbcld->ijkabc", W_oVvO[:, a0:a1, :, i0:i1],
                                 t3_tmp_2[:bj, :bk, :bb, :bc], out=r3_tmp[bijkabc], alpha=1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, j0, j1, k0, k1, a0, a1, c0, c1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("blid,jkacld->ijkabc", W_VoOv[b0:b1, :, i0:i1, :],
+                            einsum("lbdi,jkacld->ijkabc", W_oVvO[:, b0:b1, :, i0:i1],
                                 t3_tmp_2[:bj, :bk, :ba, :bc], out=r3_tmp[bijkabc], alpha=-1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, j0, j1, k0, k1, a0, a1, b0, b1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("clid,jkabld->ijkabc", W_VoOv[c0:c1, :, i0:i1, :],
+                            einsum("lcdi,jkabld->ijkabc", W_oVvO[:, c0:c1, :, i0:i1],
                                 t3_tmp_2[:bj, :bk, :ba, :bb], out=r3_tmp[bijkabc], alpha=1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, i0, i1, k0, k1, b0, b1, c0, c1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("aljd,ikbcld->ijkabc", W_VoOv[a0:a1, :, j0:j1, :],
+                            einsum("ladj,ikbcld->ijkabc", W_oVvO[:, a0:a1, :, j0:j1],
                                 t3_tmp_2[:bi, :bk, :bb, :bc], out=r3_tmp[bijkabc], alpha=-1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, i0, i1, k0, k1, a0, a1, c0, c1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("bljd,ikacld->ijkabc", W_VoOv[b0:b1, :, j0:j1, :],
+                            einsum("lbdj,ikacld->ijkabc", W_oVvO[:, b0:b1, :, j0:j1],
                                 t3_tmp_2[:bi, :bk, :ba, :bc], out=r3_tmp[bijkabc], alpha=1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, i0, i1, k0, k1, a0, a1, b0, b1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("cljd,ikabld->ijkabc", W_VoOv[c0:c1, :, j0:j1, :],
+                            einsum("lcdj,ikabld->ijkabc", W_oVvO[:, c0:c1, :, j0:j1],
                                 t3_tmp_2[:bi, :bk, :ba, :bb], out=r3_tmp[bijkabc], alpha=-1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, i0, i1, j0, j1, b0, b1, c0, c1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("alkd,ijbcld->ijkabc", W_VoOv[a0:a1, :, k0:k1, :],
+                            einsum("ladk,ijbcld->ijkabc", W_oVvO[:, a0:a1, :, k0:k1],
                                 t3_tmp_2[:bi, :bj, :bb, :bc], out=r3_tmp[bijkabc], alpha=1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, i0, i1, j0, j1, a0, a1, c0, c1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("blkd,ijacld->ijkabc", W_VoOv[b0:b1, :, k0:k1, :],
+                            einsum("lbdk,ijacld->ijkabc", W_oVvO[:, b0:b1, :, k0:k1],
                                 t3_tmp_2[:bi, :bj, :ba, :bc], out=r3_tmp[bijkabc], alpha=-1.0, beta=1.0)
                             _unp_bba_(mycc, t3bba, t3_tmp_2, i0, i1, j0, j1, a0, a1, b0, b1,
                                 blk_i=blksize_o_aaa, blk_j=blksize_o_aaa, blk_a=blksize_v_aaa, blk_b=blksize_v_aaa)
-                            einsum("clkd,ijabld->ijkabc", W_VoOv[c0:c1, :, k0:k1, :],
+                            einsum("lcdk,ijabld->ijkabc", W_oVvO[:, c0:c1, :, k0:k1],
                                 t3_tmp_2[:bi, :bj, :ba, :bb], out=r3_tmp[bijkabc], alpha=1.0, beta=1.0)
 
                             _update_packed_bbb_(mycc, r3bbb, r3_tmp, i0, i1, j0, j1, k0, k1,
@@ -1525,7 +1503,7 @@ def compute_r3aab_tri_uhf(mycc, imds, t2, t3):
     F_oo, F_vv, F_OO, F_VV = imds.F_oo, imds.F_vv, imds.F_OO, imds.F_VV
     W_oooo, W_oOoO, W_ovoo, W_oVoO = imds.W_oooo, imds.W_oOoO, imds.W_ovoo, imds.W_oVoO
     W_vOoO, W_oVoV, W_vOvO, W_vVoV = imds.W_vOoO, imds.W_oVoV, imds.W_vOvO, imds.W_vVoV
-    W_voov, W_vOoV, W_VoOv, W_VOOV = imds.W_voov, imds.W_vOoV, imds.W_VoOv, imds.W_VOOV
+    W_voov, W_vOoV, W_oVvO, W_VOOV = imds.W_voov, imds.W_vOoV, imds.W_oVvO, imds.W_VOOV
     W_vvvo, W_vVvO, W_vvvv, W_vVvV = imds.W_vvvo, imds.W_vVvO, imds.W_vvvv, imds.W_vVvV
 
     r3aab = np.zeros_like(t3aab)
@@ -1688,7 +1666,7 @@ def compute_r3aab_tri_uhf(mycc, imds, t2, t3):
                     _unp_aaa_(mycc, t3aaa, t3_tmp_3, i0, i1, j0, j1, 0, nocca, a0, a1, b0, b1, 0, nvira,
                             blk_i=blksize_o_aab, blk_j=blksize_o_aab, blk_k=nocca,
                             blk_a=blksize_v_aab, blk_b=blksize_v_aab, blk_c=nvira)
-                    einsum("clkd,ijlabd->ijabkc", W_VoOv, t3_tmp_3[:bi, :bj, :, :ba, :bb, :],
+                    einsum("lcdk,ijlabd->ijabkc", W_oVvO, t3_tmp_3[:bi, :bj, :, :ba, :bb, :],
                         out=r3_tmp[:bi, :bj, :ba, :bb], alpha=1.0, beta=1.0)
 
                     _update_packed_aab_(mycc, r3aab, r3_tmp, i0, i1, j0, j1, a0, a1, b0, b1)
@@ -1720,7 +1698,7 @@ def compute_r3bba_tri_uhf(mycc, imds, t2, t3):
     F_oo, F_vv, F_OO, F_VV = imds.F_oo, imds.F_vv, imds.F_OO, imds.F_VV
     W_oOoO, W_OOOO, W_oVoO, W_OVOO = imds.W_oOoO, imds.W_OOOO, imds.W_oVoO, imds.W_OVOO
     W_vOoO, W_oVoV, W_vOvO, W_vVoV = imds.W_vOoO, imds.W_oVoV, imds.W_vOvO, imds.W_vVoV
-    W_voov, W_vOoV, W_VoOv, W_VOOV = imds.W_voov, imds.W_vOoV, imds.W_VoOv, imds.W_VOOV
+    W_voov, W_vOoV, W_oVvO, W_VOOV = imds.W_voov, imds.W_vOoV, imds.W_oVvO, imds.W_VOOV
     W_vVvO, W_VVVO, W_vVvV, W_VVVV = imds.W_vVvO, imds.W_VVVO, imds.W_vVvV, imds.W_VVVV
 
     r3bba = np.zeros_like(t3bba)
@@ -1867,16 +1845,16 @@ def compute_r3bba_tri_uhf(mycc, imds, t2, t3):
                             bd = d1 - d0
                             _unp_aab_(mycc, t3aab, t3_tmp_2, l0, l1, 0, nocca,
                                     d0, d1, 0, nvira, blk_j=nocca, blk_b=nvira)
-                            einsum("alid,lkdcjb->ijabkc", W_VoOv[a0:a1, l0:l1, i0:i1, d0:d1],
+                            einsum("ladi,lkdcjb->ijabkc", W_oVvO[l0:l1, a0:a1, d0:d1, i0:i1],
                                 t3_tmp_2[:bl, :, :bd, :, j0:j1, b0:b1],
                                 out=r3_tmp[:bi, :bj, :ba, :bb], alpha=1.0, beta=1.0)
-                            einsum("blid,lkdcja->ijabkc", W_VoOv[b0:b1, l0:l1, i0:i1, d0:d1],
+                            einsum("lbdi,lkdcja->ijabkc", W_oVvO[l0:l1, b0:b1, d0:d1, i0:i1],
                                 t3_tmp_2[:bl, :, :bd, :, j0:j1, a0:a1],
                                 out=r3_tmp[:bi, :bj, :ba, :bb], alpha=-1.0, beta=1.0)
-                            einsum("aljd,lkdcib->ijabkc", W_VoOv[a0:a1, l0:l1, j0:j1, d0:d1],
+                            einsum("ladj,lkdcib->ijabkc", W_oVvO[l0:l1, a0:a1, d0:d1, j0:j1],
                                 t3_tmp_2[:bl, :, :bd, :, i0:i1, b0:b1],
                                 out=r3_tmp[:bi, :bj, :ba, :bb], alpha=-1.0, beta=1.0)
-                            einsum("bljd,lkdcia->ijabkc", W_VoOv[b0:b1, l0:l1, j0:j1, d0:d1],
+                            einsum("lbdj,lkdcia->ijabkc", W_oVvO[l0:l1, b0:b1, d0:d1, j0:j1],
                                 t3_tmp_2[:bl, :, :bd, :, i0:i1, a0:a1],
                                 out=r3_tmp[:bi, :bj, :ba, :bb], alpha=1.0, beta=1.0)
 
@@ -1905,7 +1883,7 @@ def compute_r3bba_tri_uhf(mycc, imds, t2, t3):
     W_oOoO = imds.W_oOoO = None
     W_oVoV = imds.W_oVoV = None
     W_vOvO = imds.W_vOvO = None
-    W_VoOv = imds.W_VoOv = None
+    W_oVvO = imds.W_oVvO = None
     W_VOOV = imds.W_VOOV = None
     W_VVVV = imds.W_VVVV = None
     W_VVVO = imds.W_VVVO = None
@@ -2045,20 +2023,10 @@ def update_amps_uccsdt_tri_(mycc, tamps, eris):
     # antisymmetrization
     antisymmetrize_r2_uhf_(r2)
     time1 = log.timer_debug1('t1t2: antisymmetrize r2', *time1)
-    # divide by eijkabc
+    # divide by eijab
     r1r2_divide_e_uhf_(mycc, r1, r2, mo_energy)
     (r1a, r1b), (r2aa, r2ab, r2bb) = r1, r2
     time1 = log.timer_debug1('t1t2: divide r1 & r2 by eia & eijab', *time1)
-
-    res_norm = [np.linalg.norm(r1a), np.linalg.norm(r1b),
-                np.linalg.norm(r2aa), np.linalg.norm(r2ab), np.linalg.norm(r2bb)]
-
-    t1a += r1a
-    t1b += r1b
-    t2aa += r2aa
-    t2ab += r2ab
-    t2bb += r2bb
-    time1 = log.timer_debug1('t1t2: update t1 & t2', *time1)
     time0 = log.timer_debug1('t1t2 total', *time0)
 
     # t3
@@ -2074,8 +2042,16 @@ def update_amps_uccsdt_tri_(mycc, tamps, eris):
     r3aaa, r3aab, r3bba, r3bbb = r3
     time1 = log.timer_debug1('t3: divide r3 by eijkabc', *time1)
 
-    res_norm += [np.linalg.norm(r3aaa), np.linalg.norm(r3aab), np.linalg.norm(r3bba), np.linalg.norm(r3bbb)]
+    res_norm = [np.linalg.norm(r1a), np.linalg.norm(r1b),
+                np.linalg.norm(r2aa), np.linalg.norm(r2ab), np.linalg.norm(r2bb),
+                np.linalg.norm(r3aaa), np.linalg.norm(r3aab), np.linalg.norm(r3bba), np.linalg.norm(r3bbb)]
 
+    t1a += r1a
+    t1b += r1b
+    t2aa += r2aa
+    t2ab += r2ab
+    t2bb += r2bb
+    r1a, r1b, r2aa, r2ab, r2bb = None, None, None, None, None
     t3aaa += r3aaa
     r3aaa = None
     t3bbb += r3bbb
@@ -2085,7 +2061,7 @@ def update_amps_uccsdt_tri_(mycc, tamps, eris):
     t3bba += r3bba
     r3bba = None
     t3 = [t3aaa, t3aab, t3bba, t3bbb]
-    time1 = log.timer_debug1('t3: update t3', *time1)
+    time1 = log.timer_debug1('t3: update t1, t2, t3', *time1)
     time0 = log.timer_debug1('t3 total', *time0)
 
     tamps = [t1, t2, t3]
@@ -2280,7 +2256,7 @@ def restore_from_diis_(mycc, diis_file, inplace=True):
             else:
                 n1, nocc1, nvir1, n2, nocc2, nvir2 = nb, noccb, nvirb, na, nocca, nvira
             if mycc.do_tri_max_t:
-                if n2 >= 0:
+                if n2 > 0:
                     shape = (nx(nocc1, n1),) + (nx(nvir1, n1),) + (nx(nocc2, n2),) + (nx(nvir2, n2),)
                 else:
                     shape = (nx(nocc1, n1),) + (nx(nvir1, n1),)
@@ -2471,6 +2447,7 @@ def dump_chk(mycc, tamps=None, frozen=None, mo_coeff=None, mo_occ=None):
         lib.chkfile.save(mycc.chkfile, 'uccsdt', cc_chk)
     else:
         lib.chkfile.save(mycc.chkfile, 'uccsdt_highm', cc_chk)
+    return mycc
 
 def tamps_tri2full_uhf(mycc, tamps_tri):
     '''Convert triangular-stored T amplitudes to their full tensor form (UHF case).'''
@@ -2912,11 +2889,11 @@ def __init__(self):
         self.F_oo, self.F_OO = None, None
         self.F_vv, self.F_VV = None, None
         self.W_oooo, self.W_oOoO, self.W_OOOO = None, None, None
-        self.W_ovoo, self.W_oVoO, self.W_OVOO = None, None, None
-        self.W_vOoO, self.W_oVoV, self.W_vOvO, self.W_vVoV = None, None, None, None
-        self.W_voov, self.W_vOoV, self.W_VoOv, self.W_VOOV = None, None, None, None
-        self.W_vvvo, self.W_vVvO, self.W_VVVO = None, None, None
+        self.W_voov, self.W_vOoV, self.W_oVvO, self.W_VOOV = None, None, None, None
+        self.W_vovo, self.W_vOvO, self.W_oVoV, self.W_VOVO = None, None, None, None
         self.W_vvvv, self.W_vVvV, self.W_VVVV = None, None, None
+        self.W_vvvo, self.W_vVvO, self.W_vVoV, self.W_VVVO = None, None, None, None
+        self.W_ovoo, self.W_oVoO, self.W_vOoO, self.W_OVOO = None, None, None, None
 
 
 if __name__ == "__main__":
diff --git a/pyscf/cc/uccsdt_highm.py b/pyscf/cc/uccsdt_highm.py
index d02de7ccbe..407c1984ea 100644
--- a/pyscf/cc/uccsdt_highm.py
+++ b/pyscf/cc/uccsdt_highm.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,14 +28,10 @@
 '''
 
 import numpy as np
-import numpy
 import functools
-from pyscf import lib
 from pyscf.lib import logger
-from pyscf.mp.mp2 import get_e_hf
-from pyscf.mp.ump2 import get_nocc, get_nmo, get_frozen_mask
 from pyscf.cc import uccsdt
-from pyscf.cc.rccsdt import _einsum, run_diis, _finalize
+from pyscf.cc.rccsdt import _einsum
 from pyscf.cc.uccsdt import (update_t1_fock_eris_uhf, intermediates_t1t2_uhf, compute_r1r2_uhf,
                             antisymmetrize_r2_uhf_, r1r2_divide_e_uhf_, intermediates_t3_uhf, _PhysicistsERIs, _IMDS)
 from pyscf import __config__
@@ -132,7 +128,7 @@ def compute_r3_uhf(mycc, imds, t2, t3):
     W_oooo, W_oOoO, W_OOOO = imds.W_oooo, imds.W_oOoO, imds.W_OOOO
     W_ovoo, W_oVoO, W_OVOO = imds.W_ovoo, imds.W_oVoO, imds.W_OVOO
     W_vOoO, W_oVoV, W_vOvO, W_vVoV = imds.W_vOoO, imds.W_oVoV, imds.W_vOvO, imds.W_vVoV
-    W_voov, W_vOoV, W_VoOv, W_VOOV = imds.W_voov, imds.W_vOoV, imds.W_VoOv, imds.W_VOOV
+    W_voov, W_vOoV, W_oVvO, W_VOOV = imds.W_voov, imds.W_vOoV, imds.W_oVvO, imds.W_VOOV
     W_vvvo, W_vVvO, W_VVVO = imds.W_vvvo, imds.W_vVvO, imds.W_VVVO
     W_vvvv, W_vVvV, W_VVVV = imds.W_vvvv, imds.W_vVvV, imds.W_VVVV
 
@@ -155,7 +151,7 @@ def compute_r3_uhf(mycc, imds, t2, t3):
     einsum("abde,ijkdec->ijkabc", W_VVVV, t3bbb, out=r3bbb, alpha=1.0 / 24.0, beta=1.0)
     einsum("lmij,lmkabc->ijkabc", W_OOOO, t3bbb, out=r3bbb, alpha=1.0 / 24.0, beta=1.0)
     einsum("alid,ljkdbc->ijkabc", W_VOOV, t3bbb, out=r3bbb, alpha=0.25, beta=1.0)
-    einsum("alid,jkbcld->ijkabc", W_VoOv, t3bba, out=r3bbb, alpha=0.25, beta=1.0)
+    einsum("ladi,jkbcld->ijkabc", W_oVvO, t3bba, out=r3bbb, alpha=0.25, beta=1.0)
     time1 = log.timer_debug1('t3: r3bbb', *time1)
 
     r3aab = np.empty_like(t3aab)
@@ -177,7 +173,7 @@ def compute_r3_uhf(mycc, imds, t2, t3):
     einsum("alid,lkdcjb->ijabkc", W_vOoV, t3bba, out=r3aab, alpha=1.0, beta=1.0)
     einsum("lcid,ljabkd->ijabkc", W_oVoV, t3aab, out=r3aab, alpha=-0.5, beta=1.0)
     einsum("aldk,ijdblc->ijabkc", W_vOvO, t3aab, out=r3aab, alpha=-0.5, beta=1.0)
-    einsum("clkd,ijlabd->ijabkc", W_VoOv, t3aaa, out=r3aab, alpha=0.25, beta=1.0)
+    einsum("lcdk,ijlabd->ijabkc", W_oVvO, t3aaa, out=r3aab, alpha=0.25, beta=1.0)
     einsum("clkd,ijabld->ijabkc", W_VOOV, t3aab, out=r3aab, alpha=0.25, beta=1.0)
     W_vvvo = imds.W_vvvo = None
     W_ovoo = imds.W_ovoo = None
@@ -201,7 +197,7 @@ def compute_r3_uhf(mycc, imds, t2, t3):
     einsum("lmij,lmabkc->ijabkc", W_OOOO, t3bba, out=r3bba, alpha=0.125, beta=1.0)
     einsum("mlki,ljabmc->ijabkc", W_oOoO, t3bba, out=r3bba, alpha=0.5, beta=1.0)
     einsum("alid,ljdbkc->ijabkc", W_VOOV, t3bba, out=r3bba, alpha=1.0, beta=1.0)
-    einsum("alid,lkdcjb->ijabkc", W_VoOv, t3aab, out=r3bba, alpha=1.0, beta=1.0)
+    einsum("ladi,lkdcjb->ijabkc", W_oVvO, t3aab, out=r3bba, alpha=1.0, beta=1.0)
     einsum("cldi,ljabkd->ijabkc", W_vOvO, t3bba, out=r3bba, alpha=-0.5, beta=1.0)
     einsum("lakd,ijdblc->ijabkc", W_oVoV, t3bba, out=r3bba, alpha=-0.5, beta=1.0)
     einsum("clkd,ijlabd->ijabkc", W_vOoV, t3bbb, out=r3bba, alpha=0.25, beta=1.0)
@@ -220,7 +216,7 @@ def compute_r3_uhf(mycc, imds, t2, t3):
     W_oOoO = imds.W_oOoO = None
     W_oVoV = imds.W_oVoV = None
     W_vOvO = imds.W_vOvO = None
-    W_VoOv = imds.W_VoOv = None
+    W_oVvO = imds.W_oVvO = None
     W_VOOV = imds.W_VOOV = None
     W_VVVV = imds.W_VVVV = None
     W_VVVO = imds.W_VVVO = None
@@ -292,20 +288,10 @@ def update_amps_uccsdt_(mycc, tamps, eris):
     # antisymmetrization
     antisymmetrize_r2_uhf_(r2)
     time1 = log.timer_debug1('t1t2: antisymmetrize r2', *time1)
-    # divide by eijkabc
+    # divide by eijab
     r1r2_divide_e_uhf_(mycc, r1, r2, mo_energy)
     (r1a, r1b), (r2aa, r2ab, r2bb) = r1, r2
     time1 = log.timer_debug1('t1t2: divide r1 & r2 by eia & eijab', *time1)
-
-    res_norm = [np.linalg.norm(r1a), np.linalg.norm(r1b),
-                np.linalg.norm(r2aa), np.linalg.norm(r2ab), np.linalg.norm(r2bb)]
-
-    t1a += r1a
-    t1b += r1b
-    t2aa += r2aa
-    t2ab += r2ab
-    t2bb += r2bb
-    time1 = log.timer_debug1('t1t2: update t1 & t2', *time1)
     time0 = log.timer_debug1('t1t2 total', *time0)
 
     # t3
@@ -324,8 +310,16 @@ def update_amps_uccsdt_(mycc, tamps, eris):
     r3aaa, r3aab, r3bba, r3bbb = r3
     time1 = log.timer_debug1('t3: divide r3 by eijkabc', *time1)
 
-    res_norm += [np.linalg.norm(r3aaa), np.linalg.norm(r3aab), np.linalg.norm(r3bba), np.linalg.norm(r3bbb)]
+    res_norm = [np.linalg.norm(r1a), np.linalg.norm(r1b),
+                np.linalg.norm(r2aa), np.linalg.norm(r2ab), np.linalg.norm(r2bb),
+                np.linalg.norm(r3aaa), np.linalg.norm(r3aab), np.linalg.norm(r3bba), np.linalg.norm(r3bbb)]
 
+    t1a += r1a
+    t1b += r1b
+    t2aa += r2aa
+    t2ab += r2ab
+    t2bb += r2bb
+    r1a, r1b, r2aa, r2ab, r2bb = None, None, None, None, None
     t3aaa += r3aaa
     r3aaa = None
     t3bbb += r3bbb
@@ -335,7 +329,7 @@ def update_amps_uccsdt_(mycc, tamps, eris):
     t3bba += r3bba
     r3bba = None
     t3 = (t3aaa, t3aab, t3bba, t3bbb)
-    time1 = log.timer_debug1('t3: update t3', *time1)
+    time1 = log.timer_debug1('t3: update t1, t2, t3', *time1)
     time0 = log.timer_debug1('t3 total', *time0)
 
     tamps = [t1, t2, t3]
diff --git a/pyscf/cc/uintermediates_slow.py b/pyscf/cc/uintermediates_slow.py
index 4b7c949f06..977bc004f6 100644
--- a/pyscf/cc/uintermediates_slow.py
+++ b/pyscf/cc/uintermediates_slow.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 import h5py
 import numpy as np
 from pyscf import lib
@@ -73,7 +72,7 @@ def cc_Wvvvv(t1,t2,eris):
     #Wabef += 0.25*einsum('mnab,mnef->abef',tau,eris.oovv)
     if t1.dtype == np.complex128: ds_type = 'c16'
     else: ds_type = 'f8'
-    _tmpfile1 = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    _tmpfile1 = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     fimd = h5py.File(_tmpfile1.name)
     nocc, nvir = t1.shape
     Wabef = fimd.create_dataset('vvvv', (nvir,nvir,nvir,nvir), ds_type)
@@ -120,7 +119,7 @@ def Wvvvv(t1,t2,eris):
     #Wabef = cc_Wvvvv(t1,t2,eris) + 0.25*einsum('mnab,mnef->abef',tau,eris.oovv)
     if t1.dtype == np.complex128: ds_type = 'c16'
     else: ds_type = 'f8'
-    _tmpfile1 = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    _tmpfile1 = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     fimd = h5py.File(_tmpfile1.name)
     nocc, nvir = t1.shape
     Wabef = fimd.create_dataset('vvvv', (nvir,nvir,nvir,nvir), ds_type)
diff --git a/pyscf/ci/test/test_cisd.py b/pyscf/ci/test/test_cisd.py
index 20f0cac12a..32eb4ac4dd 100644
--- a/pyscf/ci/test/test_cisd.py
+++ b/pyscf/ci/test/test_cisd.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 from functools import reduce
 
@@ -332,7 +331,7 @@ def test_dump_chk(self):
         H   0.   -0.757   0.587
         H   0.   0.757    0.587''', basis='631g')
         mf = scf.RHF(mol).run()
-        mf.chkfile = tempfile.NamedTemporaryFile().name
+        mf.chkfile = lib.NamedTemporaryFile().name
         ci_scanner = ci.CISD(mf).as_scanner()
         ci_scanner(mol)
         ci_scanner.nmo = mf.mo_energy.size
diff --git a/pyscf/df/df.py b/pyscf/df/df.py
index 987521e04b..47c37775a0 100644
--- a/pyscf/df/df.py
+++ b/pyscf/df/df.py
@@ -21,7 +21,6 @@
 '''
 
 
-import tempfile
 import contextlib
 import numpy
 import h5py
@@ -171,7 +170,7 @@ def build(self):
                                               max_memory=max_memory, verbose=log)
         else:
             if self._cderi_to_save is None:
-                self._cderi_to_save = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+                self._cderi_to_save = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
             cderi = self._cderi_to_save
 
             if is_custom_storage:
diff --git a/pyscf/df/outcore.py b/pyscf/df/outcore.py
index e59bf464f4..033752ccfa 100644
--- a/pyscf/df/outcore.py
+++ b/pyscf/df/outcore.py
@@ -17,7 +17,6 @@
 #
 
 
-import tempfile
 import numpy
 import scipy.linalg
 import h5py
@@ -57,7 +56,7 @@ def cholesky_eri(mol, erifile, auxbasis='weigend+etb', dataname='j3c', tmpdir=No
 
     if tmpdir is None:
         tmpdir = lib.param.TMPDIR
-    swapfile = tempfile.NamedTemporaryFile(dir=tmpdir)
+    swapfile = lib.NamedTemporaryFile(dir=tmpdir)
     cholesky_eri_b(mol, swapfile.name, auxbasis, dataname,
                    int3c, aosym, int2c, comp, max_memory, auxmol, verbose=log)
     fswap = h5py.File(swapfile.name, 'r')
@@ -243,7 +242,7 @@ def general(mol, mo_coeffs, erifile, auxbasis='weigend+etb', dataname='eri_mo',
 
     if tmpdir is None:
         tmpdir = lib.param.TMPDIR
-    swapfile = tempfile.NamedTemporaryFile(dir=tmpdir)
+    swapfile = lib.NamedTemporaryFile(dir=tmpdir)
     cholesky_eri_b(mol, swapfile.name, auxbasis, dataname,
                    int3c, aosym, int2c, comp, max_memory, verbose=log)
     fswap = h5py.File(swapfile.name, 'r')
diff --git a/pyscf/df/test/test_addons.py b/pyscf/df/test/test_addons.py
index 1a62a905a9..00918e2326 100644
--- a/pyscf/df/test/test_addons.py
+++ b/pyscf/df/test/test_addons.py
@@ -17,7 +17,6 @@
 
 import unittest
 import itertools
-import tempfile
 import numpy as np
 from pyscf import lib
 from pyscf import gto
diff --git a/pyscf/df/test/test_df.py b/pyscf/df/test/test_df.py
index 11b07b91be..a386b02e94 100644
--- a/pyscf/df/test/test_df.py
+++ b/pyscf/df/test/test_df.py
@@ -17,7 +17,6 @@
 
 import os
 import unittest
-import tempfile
 import numpy
 from pyscf import lib
 from pyscf import gto
@@ -73,7 +72,7 @@ def test_ao2mo(self):
 
     def test_cderi_to_save(self):
         with open(os.devnull, 'w') as f:
-            ftmp = tempfile.NamedTemporaryFile()
+            ftmp = lib.NamedTemporaryFile()
             dfobj = df.DF(mol)
             dfobj.auxmol = df.addons.make_auxmol(mol, 'weigend')
             dfobj.verbose = 5
@@ -132,7 +131,7 @@ def test_rsh_df_custom_storage(self):
         mol = gto.M(atom = 'H 0 0 0; F 0 0 1.1', basis='ccpvdz', max_memory=10, verbose=0)
         mf = mol.RKS().density_fit()
         mf.xc = 'lda+0.5*SR_HF(0.3)'
-        with tempfile.NamedTemporaryFile() as ftmp:
+        with lib.NamedTemporaryFile() as ftmp:
             mf.with_df._cderi_to_save = ftmp.name
             mf.run()
         self.assertAlmostEqual(mf.e_tot, -103.4965622991, 6)
diff --git a/pyscf/df/test/test_df_grad.py b/pyscf/df/test/test_df_grad.py
index 80d39fe064..a892d77220 100644
--- a/pyscf/df/test/test_df_grad.py
+++ b/pyscf/df/test/test_df_grad.py
@@ -17,7 +17,6 @@
 
 import os
 import unittest
-import tempfile
 import numpy
 from pyscf import lib
 from pyscf import gto
diff --git a/pyscf/df/test/test_df_hessian.py b/pyscf/df/test/test_df_hessian.py
index 078b967aa3..e22e79fd0e 100644
--- a/pyscf/df/test/test_df_hessian.py
+++ b/pyscf/df/test/test_df_hessian.py
@@ -17,7 +17,6 @@
 
 import os
 import unittest
-import tempfile
 import numpy
 from pyscf import lib
 from pyscf import gto
diff --git a/pyscf/df/test/test_outcore.py b/pyscf/df/test/test_outcore.py
index 2e498267dd..910124e8d0 100644
--- a/pyscf/df/test/test_outcore.py
+++ b/pyscf/df/test/test_outcore.py
@@ -16,7 +16,6 @@
 #
 
 import unittest
-import tempfile
 import numpy
 import scipy.linalg
 import h5py
@@ -47,7 +46,7 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_outcore(self):
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         cderi0 = df.incore.cholesky_eri(mol)
         df.outcore.cholesky_eri(mol, ftmp.name)
         with h5py.File(ftmp.name, 'r') as feri:
@@ -73,7 +72,7 @@ def test_outcore(self):
         with h5py.File(ftmp.name, 'r') as feri:
             self.assertTrue(numpy.allclose(feri['j3c'], cderi0.reshape(naux,-1)))
 
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         numpy.random.seed(1)
         co = numpy.random.random((nao,4))
         cv = numpy.random.random((nao,25))
@@ -96,7 +95,7 @@ def test_outcore(self):
             self.assertTrue(numpy.allclose(feri['eri_mo'], cderi0))
 
     def test_lindep(self):
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         df.outcore.cholesky_eri(mol, ftmp.name, auxmol=auxmol, verbose=7)
         with h5py.File(ftmp.name, 'r') as f:
             cderi0 = f['j3c'][:]
@@ -111,7 +110,7 @@ def test_lindep(self):
         self.assertAlmostEqual(abs(eri0-eri1).max(), 0, 9)
 
 #    def test_int3c2e_ip(self):
-#        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+#        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
 #        df.outcore.cholesky_eri(mol, ftmp.name, int3c='int3c2e_ip1',
 #                                auxmol=auxmol, comp=3)
 #        with h5py.File(ftmp.name, 'r') as f:
diff --git a/pyscf/dft/libxc.py b/pyscf/dft/libxc.py
index 5e7db2f381..3f3385f3bb 100644
--- a/pyscf/dft/libxc.py
+++ b/pyscf/dft/libxc.py
@@ -276,6 +276,7 @@ def _xc_key_without_underscore(xc_keys):
     'M05_2X'            : 'HYB_MGGA_X_M05_2X,MGGA_C_M05_2X',
     'M06_2X'            : 'HYB_MGGA_X_M06_2X,MGGA_C_M06_2X',
     'M06_HF'            : 'HYB_MGGA_X_M06_HF,MGGA_C_M06_HF',
+    'CF22D'             : 'HYB_MGGA_X_CF22D,MGGA_C_CF22D',
     # extra aliases
     'SOGGA11X'          : 'SOGGA11_X',
     'M06L'              : 'M06_L',
diff --git a/pyscf/dft/radi.py b/pyscf/dft/radi.py
index 43bfb7b71d..75a62b51b5 100644
--- a/pyscf/dft/radi.py
+++ b/pyscf/dft/radi.py
@@ -37,7 +37,7 @@
 
 # P.M.W. Gill, B.G. Johnson, J.A. Pople, Chem. Phys. Letters 209 (1993) 506-512
 SG1RADII = numpy.array((
-    0,
+    1.0000, # Ghost
     1.0000,                                                 0.5882,
     3.0769, 2.0513, 1.5385, 1.2308, 1.0256, 0.8791, 0.7692, 0.6838,
     4.0909, 3.1579, 2.5714, 2.1687, 1.8750, 1.6514, 1.4754, 1.3333))
diff --git a/pyscf/dft/test/test_h2o.py b/pyscf/dft/test/test_h2o.py
index aa55453984..ca44e6cdf4 100644
--- a/pyscf/dft/test/test_h2o.py
+++ b/pyscf/dft/test/test_h2o.py
@@ -22,7 +22,7 @@
 from pyscf import scf
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 def setUpModule():
diff --git a/pyscf/dft/test/test_he.py b/pyscf/dft/test/test_he.py
index 00400d0f17..67ce66dbc4 100644
--- a/pyscf/dft/test/test_he.py
+++ b/pyscf/dft/test/test_he.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 from pyscf import gto
 from pyscf import lib
@@ -215,14 +214,14 @@ def test_convert(self):
 
     # issue 1986
     def test_init_guess_chkfile(self):
-        with tempfile.NamedTemporaryFile() as tmpf:
-            mol = gto.M(atom='He 0 0 0', basis='631g', charge=1, spin=1)
+        mol = gto.M(atom='He 0 0 0', basis='631g', charge=1, spin=1)
+        with lib.NamedTemporaryFile() as tmpf:
             mf = dft.RKS(mol)
             mf.chkfile = tmpf.name
             e1 = mf.kernel()
             mf = dft.RKS(mol)
-            mf.init_guess = 'chkfile'
             mf.chkfile = tmpf.name
+            mf.init_guess = 'chkfile'
             mf.max_cycle = 1
             e2 = mf.kernel()
             self.assertAlmostEqual(e1, e2, 9)
diff --git a/pyscf/dft/test/test_libxc.py b/pyscf/dft/test/test_libxc.py
index 7f47c5a624..a820cf4b96 100644
--- a/pyscf/dft/test/test_libxc.py
+++ b/pyscf/dft/test/test_libxc.py
@@ -446,6 +446,7 @@ def test_dft_parser(self):
         self.assertEqual(parse_dft('b3lyp-d3zerom'), ('b3lyp', '', 'd3zerom'))
         self.assertEqual(parse_dft('wb97x-d3bj'), ('wb97x-v', False, 'd3bj'))
         self.assertEqual(parse_dft('wb97x-d3zero2b'), ('wb97x', '', 'd3zero2b'))
+        self.assertEqual(parse_dft('wb97x-3c'), ('wb97x-v', False, 'd4:wb97x-3c'))
 
     def test_set_param(self):
         XC_ID_B97_2 = 410
diff --git a/pyscf/eph/test/test_rhf.py b/pyscf/eph/test/test_rhf.py
index 3c72e92cf6..5ac5aba9bf 100644
--- a/pyscf/eph/test/test_rhf.py
+++ b/pyscf/eph/test/test_rhf.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 #
 
-import tempfile
 from pyscf import scf, gto, lib
 from pyscf.eph import eph_fd, rhf
 import numpy as np
@@ -33,7 +32,6 @@ def setUpModule():
     mol.output = '/dev/null'
     mol.build()
     mf = scf.RHF(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.conv_tol = 1e-14
     mf.conv_tol_grad = 1e-9
     mf.kernel()
diff --git a/pyscf/eph/test/test_rks.py b/pyscf/eph/test/test_rks.py
index 4383b89d45..3fd9cd4d39 100644
--- a/pyscf/eph/test/test_rks.py
+++ b/pyscf/eph/test/test_rks.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 #
 
-import tempfile
 from pyscf import dft, gto, lib
 from pyscf.eph import eph_fd, rks
 import numpy as np
@@ -33,7 +32,6 @@ def setUpModule():
     mol.output = '/dev/null'
     mol.build()
     mf = dft.RKS(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.grids.level = 3
     mf.xc = 'b3lyp5'
     mf.conv_tol = 1e-14
diff --git a/pyscf/eph/test/test_uhf.py b/pyscf/eph/test/test_uhf.py
index ac20518f83..3508afb2ed 100644
--- a/pyscf/eph/test/test_uhf.py
+++ b/pyscf/eph/test/test_uhf.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 #
 
-import tempfile
 from pyscf import scf, gto, lib
 from pyscf.eph import eph_fd, uhf
 import numpy as np
@@ -33,7 +32,6 @@ def setUpModule():
     mol.output = '/dev/null'
     mol.build()
     mf = scf.UHF(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.conv_tol = 1e-14
     mf.conv_tol_grad = 1e-9
     mf.kernel()
diff --git a/pyscf/eph/test/test_uks.py b/pyscf/eph/test/test_uks.py
index 0cdba70440..2308ce88e0 100644
--- a/pyscf/eph/test/test_uks.py
+++ b/pyscf/eph/test/test_uks.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 #
 
-import tempfile
 from pyscf import dft, gto, lib
 from pyscf.eph import eph_fd, uks
 import numpy as np
@@ -34,7 +33,6 @@ def setUpModule():
     mol.build()
 
     mf = dft.UKS(mol)
-    mf.chkfile = tempfile.NamedTemporaryFile().name
     mf.grids.level = 3
     mf.xc = 'b3lyp5'
     mf.conv_tol = 1e-14
diff --git a/pyscf/fci/test/test_rdm.py b/pyscf/fci/test/test_rdm.py
index 7366783c7d..7c69d03ea0 100644
--- a/pyscf/fci/test/test_rdm.py
+++ b/pyscf/fci/test/test_rdm.py
@@ -267,7 +267,7 @@ def test_full_alpha(self):
         h2 = numpy.random.random((npair,npair)) * .1
         h2 = h2 + h2.T
         cis = fci.direct_spin1.FCI()
-        e, c = cis.kernel(h1, h2, norb, nelec, verbose=5)
+        e, c = cis.kernel(h1, h2, norb, nelec)
         dm1s, dm2s = cis.make_rdm12s(c, norb, nelec)
         self.assertAlmostEqual(abs(dm1s[0]).sum(), 6, 9)
         self.assertAlmostEqual(dm1s[1].trace(), 3, 9)
@@ -285,7 +285,7 @@ def test_0beta(self):
         h2 = numpy.random.random((npair,npair)) * .1
         h2 = h2 + h2.T
         cis = fci.direct_spin1.FCI()
-        e, c = cis.kernel(h1, h2, norb, nelec, verbose=5)
+        e, c = cis.kernel(h1, h2, norb, nelec)
         dm1s, dm2s = cis.make_rdm12s(c, norb, nelec)
         self.assertAlmostEqual(dm1s[0].trace(), 3, 9)
         self.assertAlmostEqual(abs(dm1s[1]).sum(), 0, 9)
diff --git a/pyscf/grad/dispersion.py b/pyscf/grad/dispersion.py
index 1f954930b9..3b1695f172 100644
--- a/pyscf/grad/dispersion.py
+++ b/pyscf/grad/dispersion.py
@@ -28,10 +28,12 @@ def get_dispersion(mf_grad, disp=None, with_3body=None, verbose=None):
     '''gradient of DFTD3/DFTD4 dispersion correction'''
     mf = mf_grad.base
     mol = mf.mol
-    disp_version = check_disp(mf, disp)
-    if not disp_version:
+    if not check_disp(mf, disp):
         return np.zeros([mol.natm,3])
 
+    if disp is None:
+        disp = getattr(mf, 'disp', None)
+
     try:
         from pyscf.dispersion import dftd3, dftd4
     except ImportError:
@@ -39,9 +41,9 @@ def get_dispersion(mf_grad, disp=None, with_3body=None, verbose=None):
         raise
 
     method = getattr(mf, 'xc', 'hf')
-    method, _, disp_with_3body = parse_disp(method)
+    method, disp_version, disp_with_3body = parse_disp(method, disp)
 
-    if with_3body is not None:
+    if with_3body is None:
         with_3body = disp_with_3body
 
     if disp_version[:2].upper() == 'D3':
diff --git a/pyscf/grad/test/test_lpdft.py b/pyscf/grad/test/test_lpdft.py
index 0ced79fe05..61b17eb05f 100644
--- a/pyscf/grad/test/test_lpdft.py
+++ b/pyscf/grad/test/test_lpdft.py
@@ -286,7 +286,7 @@ def test_rohf_sanity (self):
                 de_ref = mc_grad_ref.kernel(state=i)[1, 0]
                 self.assertAlmostEqual (de, de_ref, 6)
 
-    def test_dfrohf_sanity (self):
+    def test_dfrohf_sanity_high_cost (self):
         n_states = 3
         mc_grad = diatomic(
             "Li", "H", 1.4, "ftpbe", "6-31g", 4, 2, n_states, density_fit=True, spin=2
diff --git a/pyscf/grad/test/test_mcpdft.py b/pyscf/grad/test/test_mcpdft.py
index af65a7ea34..82f7b025eb 100644
--- a/pyscf/grad/test/test_mcpdft.py
+++ b/pyscf/grad/test/test_mcpdft.py
@@ -43,23 +43,26 @@ def auto_setup (xyz='Li 0 0 0\nH 1.5 0 0'):
     mol_sym = gto.M (atom = xyz, basis = 'sto3g', symmetry=True,
                      output = '/dev/null', verbose = 0)
     mf_nosym = scf.RHF (mol_nosym).run ()
-    mc_nosym = mcscf.CASSCF (mf_nosym, 5, 2).run ()
+    mc_nosym = mcscf.CASSCF (mf_nosym, 5, 2)
     mf_sym = scf.RHF (mol_sym).run ()
     mc_sym = mcscf.CASSCF (mf_sym, 5, 2).run ()
+    mc_nosym.run (mo_coeff=mc_sym.mo_coeff)
     mcp_ss_nosym = mcpdft.CASSCF (mc_nosym, 'ftLDA,VWN3', 5, 2,
                                   grids_level=1).run ()
     mcp_ss_sym = mcpdft.CASSCF (mc_sym, 'ftLDA,VWN3', 5, 2,
                                 grids_level=1).run ()
-    mcp_sa_0 = mcp_ss_nosym.state_average ([1.0/5,]*5).run ()
+    mcp_sa_0 = mcp_ss_nosym.state_average ([1.0/5,]*5)
     solver_S = fci.solver (mol_nosym, singlet=True).set (spin=0, nroots=2)
     solver_T = fci.solver (mol_nosym, singlet=False).set (spin=2, nroots=3)
     mcp_sa_1 = mcp_ss_nosym.state_average_mix (
-        [solver_S,solver_T], [1.0/5,]*5).set(ci=None).run ()
+        [solver_S,solver_T], [1.0/5,]*5).set(ci=None)
     solver_A1 = fci.solver (mol_sym).set (wfnsym='A1', nroots=3)
     solver_E1x = fci.solver (mol_sym).set (wfnsym='E1x', nroots=1, spin=2)
     solver_E1y = fci.solver (mol_sym).set (wfnsym='E1y', nroots=1, spin=2)
     mcp_sa_2 = mcp_ss_sym.state_average_mix (
         [solver_A1,solver_E1x,solver_E1y], [1.0/5,]*5).set(ci=None).run ()
+    mcp_sa_0.run (mo_coeff=mcp_sa_2.mo_coeff)
+    mcp_sa_1.run (mo_coeff=mcp_sa_2.mo_coeff)
     mcp = [[mcp_ss_nosym, mcp_ss_sym], [mcp_sa_0, mcp_sa_1, mcp_sa_2]]
     nosym = [mol_nosym, mf_nosym, mc_nosym]
     sym = [mol_sym, mf_sym, mc_sym]
diff --git a/pyscf/grad/test/test_rhf.py b/pyscf/grad/test/test_rhf.py
index 087e6cf40e..30896ab731 100644
--- a/pyscf/grad/test/test_rhf.py
+++ b/pyscf/grad/test/test_rhf.py
@@ -20,7 +20,7 @@
 
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 def setUpModule():
diff --git a/pyscf/grad/test/test_rks.py b/pyscf/grad/test/test_rks.py
index fe6e509122..42c815ebe9 100644
--- a/pyscf/grad/test/test_rks.py
+++ b/pyscf/grad/test/test_rks.py
@@ -20,7 +20,7 @@
 from pyscf.grad import rks
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 
diff --git a/pyscf/grad/test/test_uhf.py b/pyscf/grad/test/test_uhf.py
index 25d3885031..971e61627f 100644
--- a/pyscf/grad/test/test_uhf.py
+++ b/pyscf/grad/test/test_uhf.py
@@ -19,7 +19,7 @@
 from pyscf import grad
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 
@@ -196,8 +196,8 @@ def test_finite_diff_df_uhf_grad(self):
 H             -0.43459905    0.65805058   -0.00861418''')
         self.assertAlmostEqual(g[2,1], (e2-e1)/2e-4*lib.param.BOHR, 7)
 
-    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
-    def test_finite_diff_df_uhf_d4_grad(self):
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_finite_diff_df_uhf_d3_grad(self):
         mf = scf.UHF(mol).density_fit ()
         mf.conv_tol = 1e-14
         mf.disp = 'd3bj'
diff --git a/pyscf/grad/test/test_uks.py b/pyscf/grad/test/test_uks.py
index ed77ba14b3..8331fee615 100644
--- a/pyscf/grad/test/test_uks.py
+++ b/pyscf/grad/test/test_uks.py
@@ -20,7 +20,7 @@
 from pyscf.grad import uks
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 
diff --git a/pyscf/gto/basis/__init__.py b/pyscf/gto/basis/__init__.py
index bbcd110f7b..84ec26c04f 100644
--- a/pyscf/gto/basis/__init__.py
+++ b/pyscf/gto/basis/__init__.py
@@ -711,15 +711,33 @@ def load(filename_or_basisname, symb, optimize=OPTIMIZE_CONTRACTION):
 
         raise BasisNotFoundError(f'Unknown basis format or basis name for {filename_or_basisname}')
 
-    if 'dat' in basmod:
-        b = fload(join(basis_dir, basmod), symb, optimize)
-    elif isinstance(basmod, (tuple, list)) and isinstance(basmod[0], str):
-        b = []
-        for f in basmod:
-            b += fload(join(basis_dir, f), symb, optimize)
-    else:
-        mod = importlib.import_module('.'+basmod, __package__)
-        b = mod.__getattribute__(symb)
+    try:
+        if 'dat' in basmod:
+            b = fload(join(basis_dir, basmod), symb, optimize)
+        elif isinstance(basmod, (tuple, list)) and isinstance(basmod[0], str):
+            b = []
+            for f in basmod:
+                b += fload(join(basis_dir, f), symb, optimize)
+        else:
+            mod = importlib.import_module('.'+basmod, __package__)
+            b = mod.__getattribute__(symb)
+    except (BasisNotFoundError, AttributeError):
+        # When basis set is recognized but its .dat file lacks required elements (e.g., lanthanides), fallback to BSE
+        from pyscf.gto.basis import bse
+        if bse.basis_set_exchange is None:
+            warnings.warn(
+                'Basis may be available in basis-set-exchange. '
+                'It is recommended to install basis-set-exchange: '
+                'pip install basis-set-exchange')
+            raise BasisNotFoundError(
+                f'Basis set not found for {symb} in {filename_or_basisname}')
+        try:
+            bse_obj = bse.basis_set_exchange.api.get_basis(
+                filename_or_basisname, elements=symb)
+        except KeyError:
+            raise BasisNotFoundError(
+                f'Basis set not found for {symb} in {filename_or_basisname}')
+        b = bse._orbital_basis(bse_obj)[0][symb]
 
     if contr_scheme != 'Full':
         b = _truncate(b, contr_scheme, symb, split_name)
@@ -739,6 +757,23 @@ def load_ecp(filename_or_basisname, symb):
         return parse_nwchem_ecp.load(join(_BASIS_DIR, basmod), symb)
 
     if '\n' not in filename_or_basisname:
+        from pyscf.gto.basis import bse
+        if bse.basis_set_exchange is None:
+            warnings.warn(
+                'ECP may be available in basis-set-exchange. '
+                'It is recommended to install basis-set-exchange: '
+                'pip install basis-set-exchange')
+        else:
+            try:
+                bse_obj = bse.basis_set_exchange.api.get_basis(
+                    filename_or_basisname, elements=symb)
+            except KeyError:
+                raise BasisNotFoundError(filename_or_basisname)
+            ecp_basis = bse._ecp_basis(bse_obj)
+            if symb not in ecp_basis:
+                raise BasisNotFoundError(
+                    f'No ECP defined for {symb} in {filename_or_basisname}')
+            return ecp_basis[symb]
         raise RuntimeError(f'Unable to parse the input ECP data\n{filename_or_basisname}')
 
     try:
@@ -760,20 +795,6 @@ def load_ecp(filename_or_basisname, symb):
         except BasisNotFoundError:
             pass
 
-    # Last, a trial to access Basis Set Exchange database
-    from pyscf.gto.basis import bse
-    if bse.basis_set_exchange is not None:
-        try:
-            bse_obj = bse.basis_set_exchange.api.get_basis(
-                filename_or_basisname, elements=symb)
-        except KeyError:
-            raise BasisNotFoundError(filename_or_basisname)
-        ecp_basis = bse._ecp_basis(bse_obj)
-        if len(ecp_basis) > 0:
-            return ecp_basis[symb]
-        else:
-            return {}
-
     raise BasisNotFoundError('Unknown ECP format or ECP name')
 
 # PP_NAME_PATTERN follows the convention of CP2K orbital basis and pseudo names
diff --git a/pyscf/gto/test/test_basis_parser.py b/pyscf/gto/test/test_basis_parser.py
index 1d6cbbf3b0..b1da3118d3 100644
--- a/pyscf/gto/test/test_basis_parser.py
+++ b/pyscf/gto/test/test_basis_parser.py
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import unittest
-import tempfile
 from functools import reduce
 import numpy
 from pyscf import gto
@@ -30,14 +30,15 @@
 
 class KnownValues(unittest.TestCase):
     def test_parse_pople(self):
+        join = os.path.join
         self.assertEqual(gto.basis._parse_pople_basis('631g(d)', 'C'),
-                         ('pople-basis/6-31G.dat', 'pople-basis/6-31G-polarization-d.dat'))
+                         (join('pople-basis', '6-31G.dat'), join('pople-basis', '6-31G-polarization-d.dat')))
         self.assertEqual(gto.basis._parse_pople_basis('631g**', 'C'),
-                         ('pople-basis/6-31Gss.dat',))
+                         (join('pople-basis', '6-31Gss.dat'),))
         self.assertEqual(gto.basis._parse_pople_basis('631++g**', 'C'),
-                         ('pople-basis/6-31++Gss.dat',))
+                         (join('pople-basis', '6-31++Gss.dat'),))
         self.assertEqual(gto.basis._parse_pople_basis('6311+g(d,p)', 'C'),
-                         ('pople-basis/6-311+G.dat', 'pople-basis/6-311G-polarization-d.dat'))
+                         (join('pople-basis', '6-311+G.dat'), join('pople-basis', '6-311G-polarization-d.dat')))
         self.assertRaises(KeyError, gto.basis._parse_pople_basis, '631g++', 'C')
 
     def test_basis_load(self):
@@ -61,7 +62,7 @@ def test_basis_load(self):
         self.assertEqual(len(gto.basis.load('def2-svp', 'Rn')), 16)
 
     def test_basis_load_from_file(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         ftmp.write('''
 Li    S
      16.1195750              0.15432897
@@ -401,7 +402,7 @@ def test_parse_gaussian_basis(self):
         self.assertEqual(ref, basis1)
 
     def test_parse_gaussian_load_basis(self):
-        with tempfile.NamedTemporaryFile(mode='w+') as f:
+        with lib.NamedTemporaryFile(mode='w+') as f:
             f.write('''
 ****
 H 0
@@ -412,7 +413,7 @@ def test_parse_gaussian_load_basis(self):
             f.flush()
             self.assertEqual(parse_gaussian.load(f.name, 'H'), [[0, [1., 1.]]])
 
-        with tempfile.NamedTemporaryFile(mode='w+') as f:
+        with lib.NamedTemporaryFile(mode='w+') as f:
             f.write('''
 H 0
 S 1 1.0
@@ -422,7 +423,7 @@ def test_parse_gaussian_load_basis(self):
             f.flush()
             self.assertEqual(parse_gaussian.load(f.name, 'H'), [[0, [1., 1.]]])
 
-        with tempfile.NamedTemporaryFile(mode='w+') as f:
+        with lib.NamedTemporaryFile(mode='w+') as f:
             f.write('''
 ****
 H 0
@@ -432,7 +433,7 @@ def test_parse_gaussian_load_basis(self):
             f.flush()
             self.assertEqual(parse_gaussian.load(f.name, 'H'), [[0, [1., 1.]]])
 
-        with tempfile.NamedTemporaryFile(mode='w+') as f:
+        with lib.NamedTemporaryFile(mode='w+') as f:
             f.write('''
 H 0
 S 1 1.0
diff --git a/pyscf/gto/test/test_ecp.py b/pyscf/gto/test/test_ecp.py
index b65b7dcb80..cee3a99dac 100644
--- a/pyscf/gto/test/test_ecp.py
+++ b/pyscf/gto/test/test_ecp.py
@@ -420,6 +420,53 @@ def test_ecp_f_in_valence(self):
         self.assertEqual(mol.ao_labels()[40], '0 U 5f-3  ')
         self.assertAlmostEqual(lib.fp(mf.get_hcore()), -55.38627201912257)
 
+    def test_large_exponent_ecp_closed_form(self):
+        # Regression test for the adaptive Gauss-Chebyshev radial quadrature
+        # in nr_ecp.c.  At large combined exponents the integrand is sharply
+        # peaked at small r; two successive coarse rules would happen to agree
+        # to 1e-12 even when both were under-resolved, so the loop declared
+        # premature convergence and the integral could be off by 1e-5.
+        #
+        # The radial integral with a same-center primitive AO of angular
+        # momentum l_ao (single primitive, exponent al) and a same-center
+        # local/semilocal ECP channel c * r^(n-2) * exp(-g r^2) factorises so
+        # that the ratio I(g1)/I(g2) at fixed alpha and l is
+        #   ((2 alpha + g2) / (2 alpha + g1)) ** ((n + 2*l_ao + 1) / 2)
+        # independent of the AO normalisation, providing a stringent
+        # closed-form check on the radial quadrature.
+        L_SYM = {0: 'S', 1: 'P', 2: 'D'}
+
+        def build_local(n, l, al, g):
+            basis = {'Kr': [[l, [al, 1.0]]]}
+            ecp = 'ECP\nKr nelec 0\nKr ul\n%d  %.10e  1.0\nEND\n' % (n, g)
+            return gto.M(atom='Kr 0 0 0', basis=basis,
+                         ecp={'Kr': ecp}, verbose=0)
+
+        def build_semilocal(n, l, al, g):
+            basis = {'Kr': [[l, [al, 1.0]]]}
+            # zero ul keeps a local channel present (required by parser)
+            ecp = ('ECP\nKr nelec 0\nKr ul\n2  1.0  0.0\n'
+                   'Kr %s\n%d  %.10e  1.0\nEND\n' % (L_SYM[l], n, g))
+            return gto.M(atom='Kr 0 0 0', basis=basis,
+                         ecp={'Kr': ecp}, verbose=0)
+
+        def ratio_closed(n, l, al, g1, g2):
+            p = (n + 2 * l + 1) / 2.0
+            return ((2 * al + g2) / (2 * al + g1)) ** p
+
+        worst = 0.0
+        for build in (build_local, build_semilocal):
+            for n in (1, 2):
+                for l in (0, 1, 2):
+                    for al in (1e0, 1e2, 1e4, 3e5):
+                        for g1, g2 in ((1e1, 1e7), (1e3, 1e7), (1e5, 1e7)):
+                            m1 = build(n, l, al, g1).intor('ECPscalar')[0, 0]
+                            m2 = build(n, l, al, g2).intor('ECPscalar')[0, 0]
+                            r = m1 / m2
+                            err = abs(r / ratio_closed(n, l, al, g1, g2) - 1)
+                            worst = max(worst, err)
+        self.assertLess(worst, 1e-10)
+
 
 if __name__ == '__main__':
     print("Full Tests for ECP")
diff --git a/pyscf/gto/test/test_mole.py b/pyscf/gto/test/test_mole.py
index a2f7aa6afa..fc7577d3d9 100644
--- a/pyscf/gto/test/test_mole.py
+++ b/pyscf/gto/test/test_mole.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 from functools import reduce
 import numpy
 import numpy as np
@@ -44,7 +43,7 @@ def setUpModule():
     mol0.spin = 1
     mol0.verbose = 7
     mol0.ecp = {'O1': 'lanl2dz'}
-    ftmp = tempfile.NamedTemporaryFile()
+    ftmp = lib.NamedTemporaryFile()
     mol0.output = ftmp.name
     mol0.build()
 
@@ -257,7 +256,7 @@ def test_first_argument(self):
         self.assertEqual(mol1.natm, 1)
 
     def test_atom_as_file(self):
-        ftmp = tempfile.NamedTemporaryFile('w')
+        ftmp = lib.NamedTemporaryFile('w')
         # file in raw format
         ftmp.write('He 0 0 0\nHe 0 0 1\n')
         ftmp.flush()
@@ -265,14 +264,14 @@ def test_atom_as_file(self):
         self.assertEqual(mol1.natm, 2)
 
         # file in xyz format
-        ftmp = tempfile.NamedTemporaryFile('w', suffix='.xyz')
+        ftmp = lib.NamedTemporaryFile('w', suffix='.xyz')
         ftmp.write('2\n\nHe 0 0 0\nHe 0 0 1\n')
         ftmp.flush()
         mol1 = gto.M(atom=ftmp.name)
         self.assertEqual(mol1.natm, 2)
 
         # file in zmatrix format
-        ftmp = tempfile.NamedTemporaryFile('w', suffix='.zmat')
+        ftmp = lib.NamedTemporaryFile('w', suffix='.zmat')
         ftmp.write('He\nHe 1 1.5\n')
         ftmp.flush()
         mol1 = gto.M(atom=ftmp.name)
@@ -621,7 +620,7 @@ def test_atom_method(self):
 
     def test_dump_loads_skip(self):
         import json
-        with tempfile.NamedTemporaryFile() as tmpfile:
+        with lib.NamedTemporaryFile() as tmpfile:
             lib.chkfile.save_mol(mol0, tmpfile.name)
             mol1 = gto.Mole()
             mol1.update(tmpfile.name)
@@ -975,7 +974,7 @@ def test_ao2mo(self):
         self.assertAlmostEqual(eri[0,0], 1.0557129427350722, 12)
 
     def test_tofile(self):
-        tmpfile = tempfile.NamedTemporaryFile()
+        tmpfile = lib.NamedTemporaryFile()
         mol = gto.M(atom=[[1  , (0.,1.,1.)],
                           ["O1", (0.,0.,0.)],
                           [1  , (1.,1.,0.)], ])
@@ -990,7 +989,7 @@ def test_tofile(self):
             self.assertEqual(f.read(), ref)
         self.assertEqual(out1, ref[:-1])
 
-        tmpfile = tempfile.NamedTemporaryFile(suffix='.zmat')
+        tmpfile = lib.NamedTemporaryFile(suffix='.zmat')
         str1 = mol.tofile(tmpfile.name, format='zmat')
         #FIXME:self.assertEqual(mol._atom, mol.fromfile(tmpfile.name))
 
@@ -1020,7 +1019,7 @@ def test_fromstring(self):
         print(mol.unit == 'Angstrom')
 
     def test_fromfile(self):
-        with tempfile.NamedTemporaryFile(mode='w+', suffix='.xyz') as f:
+        with lib.NamedTemporaryFile(mode='w+', suffix='.xyz') as f:
             f.write('2\n\nH 0 0 1; H 0 -1 0')
             f.flush()
             mol = gto.Mole()
diff --git a/pyscf/gw/bse.py b/pyscf/gw/bse.py
new file mode 100644
index 0000000000..3fc5597d7a
--- /dev/null
+++ b/pyscf/gw/bse.py
@@ -0,0 +1,2006 @@
+#!/usr/bin/env python
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Tianyu Zhu <zhutianyu1991@gmail.com>
+# Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
+# Author: Jiachen Li <lijiachen.duke@gmail.com>
+#
+
+"""
+Bethe-Salpeter equation (BSE) for excitation energy.
+Both restricted and unrestricted cases are supported.
+BSE can be solved with (energy-specific) Davidson algorithm, Lanczos algorithms or fully diagonalization.
+
+References:
+    Hillenbrand, Christopher, Jiachen Li, and Tianyu Zhu. J. Chem. Phys. 162, 174117 (2025).
+    J. Comput. Chem. 38, 383 (2017).
+    Ghosh, S. K and  Chattaraj, P. K. (Eds.). (2013).
+    SIAM J. Matrix Anal. Appl. 39, 683 (2018).
+"""
+
+import time
+
+import numpy as np
+import scipy
+import scipy.linalg as sla
+import h5py
+
+from pyscf import lib
+from pyscf.data import nist
+from pyscf.tools import mo_mapping
+
+HARTREE2EV = nist.HARTREE2EV
+
+einsum = lib.einsum
+
+
+def bse_full_diagonalization(multi, nocc, mo_energy, Lpq, TDA=False):
+    """Full diagonalization of BSE equation.
+    BSE equation is defined as equation 1 in doi.org/10.1002/jcc.24688.
+    Spin-adapted formalism can be found in chapter 18.3.2 in "Concepts and methods in modern theoretical chemistry.
+    Electronic structure (2013, CRC) Ghosh S.K., Chattaraj P.K. (eds.)"
+    The working equation is rewritten as equation 15 in doi.org/10.1063/1.477483.
+
+    Parameters
+    ----------
+    multi : str
+        multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
+    nocc : int array
+        numbers of occupied orbitals.
+    mo_energy : double array
+        orbital energy.
+    Lpq : double array
+        three-center density-fitting matrix in MO.
+    TDA : bool, optional
+        use Tamm-Dancoff approximation, by default False
+
+    Returns
+    -------
+    exci : double array
+        excitation energy.
+    X_vec : list of double ndarray
+        X blocks of eigenvectors (excitations).
+    Y_vec : list of double ndarray
+        Y blocks of eigenvectors (de-excitation).
+    """
+    nspin, _, nmo, _ = Lpq.shape
+
+    # determine dimension
+    nvir = [(nmo - nocc[i]) for i in range(nspin)]
+    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
+    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]
+    apb = np.zeros(shape=[full_dim, full_dim], dtype=np.double)
+
+    # amb is not allocated if TDA is true, since B=0
+    if not TDA:
+        amb = np.zeros_like(apb)
+
+    Lpq_bar = _get_lpq_bar(nocc=nocc, mo_energy=mo_energy, Lpq=Lpq)
+
+    # scale Coulomb matrix
+    scale = 4.0 / nspin
+    if TDA:
+        scale /= 2.0
+
+    # Coulomb part
+    if multi == 's' or multi == 'u':
+        for i in range(nspin):
+            for j in range(nspin):
+                apb[i * dim[0] : i * dim[0] + dim[i], j * dim[0] : j * dim[0] + dim[j]] += einsum(
+                    'Lia,Ljb->iajb', Lpq[i][:, : nocc[i], nocc[i] :], Lpq[j][:, : nocc[j], nocc[j] :]
+                ).reshape(dim[i], dim[j])
+        apb *= scale
+
+    # W part
+    for i in range(nspin):
+        WA = -einsum('Lij,Lab->iajb', Lpq[i][:, : nocc[i], : nocc[i]], Lpq_bar[i][:, nocc[i] :, nocc[i] :])
+        WA = WA.reshape(nocc[i] * nvir[i], nocc[i] * nvir[i])
+        apb[i * dim[0] : i * dim[0] + dim[i], i * dim[0] : i * dim[0] + dim[i]] += WA
+        if not TDA:
+            amb[i * dim[0] : i * dim[0] + dim[i], i * dim[0] : i * dim[0] + dim[i]] += WA
+            WB = -einsum('Lib,Laj->iajb', Lpq[i][:, : nocc[i], nocc[i] :], Lpq_bar[i][:, nocc[i] :, : nocc[i]])
+            WB = WB.reshape(nocc[i] * nvir[i], nocc[i] * nvir[i])
+            apb[i * dim[0] : i * dim[0] + dim[i], i * dim[0] : i * dim[0] + dim[i]] += WB
+            amb[i * dim[0] : i * dim[0] + dim[i], i * dim[0] : i * dim[0] + dim[i]] -= WB
+
+    # orbital energy contribution to A+B and A-B matrix
+    orb_diff = []
+    for i in range(nspin):
+        orb_diff.append((mo_energy[i][None, nocc[i] :] - mo_energy[i][: nocc[i], None]).reshape(-1))
+    orb_diff = np.concatenate(orb_diff, axis=0)
+    if not TDA:
+        np.fill_diagonal(amb, orb_diff + np.diag(amb))
+    np.fill_diagonal(apb, orb_diff + np.diag(apb))
+
+    if TDA:
+        # Diagonalizing A is numerically more stable than
+        # diagonalizing A^2. Solve standard hermitian eigenvalue problem
+
+        # B = 0, so A = apb
+        exci, xpy = scipy.linalg.eigh(apb)
+        X_vec = xpy.T
+        Y_vec = np.zeros_like(xpy)
+
+    else:
+        # equation 15 in doi/10.1063/1.477483, solved by LAPACK function dsygvd
+        exci_sqr, xpy_w = scipy.linalg.eigh(apb, amb, type=3)
+        exci = np.sqrt(exci_sqr)
+
+        # dsygvd normalizes xpy_w such that
+        # xpy_w @ xpy_w.T = A - B
+        # Using the fact that A - B = (X+Y) @ diag(w) @ (X+Y).T,
+        # we calculate X+Y = xpy_w @ diag(1/sqrt(w)).
+        xpy = xpy_w / np.sqrt(exci)[None, :]
+
+        # (A+B) |X+Y> = w |X-Y>, so
+        # |X-Y> = w^-1 (A+B) |X+Y>
+        xmy = (apb @ xpy) / exci[None, :]
+
+        # Rows of X_vec and Y_vec are the eigenvectors, hence the transpose.
+        X_vec = (xpy + xmy).T / 2.0
+        Y_vec = (xpy - xmy).T / 2.0
+
+    # reshape X and Y eigenvector
+    if nspin == 1:
+        X_vec = [X_vec.reshape(-1, nocc[0], nvir[0])]
+        Y_vec = [Y_vec.reshape(-1, nocc[0], nvir[0])]
+    else:
+        X_vec_a, X_vec_b, Y_vec_a, Y_vec_b = [], [], [], []
+        for r in range(len(exci)):
+            X_vec_a.append(X_vec[r][: dim[0]].reshape(nocc[0], nvir[0]))
+            X_vec_b.append(X_vec[r][dim[0] :].reshape(nocc[1], nvir[1]))
+            Y_vec_a.append(Y_vec[r][: dim[0]].reshape(nocc[0], nvir[0]))
+            Y_vec_b.append(Y_vec[r][dim[0] :].reshape(nocc[1], nvir[1]))
+        X_vec = [np.asarray(X_vec_a), np.asarray(X_vec_b)]
+        Y_vec = [np.asarray(Y_vec_a), np.asarray(Y_vec_b)]
+
+    return exci, X_vec, Y_vec
+
+
+def davidson_restart(Mp, Mm, tri_vec, nvec_pair_to_save, e_min=0.0):
+    """Restart Davidson algorithm.
+
+    Parameters
+    ----------
+    Mp : ndarray
+        The matrix <tri_vec|A+B|tri_vec>
+    Mm : ndarray or None
+        The matrix <tri_vec|A-B|tri_vec>
+    tri_vec : ndarray
+        Trial vectors.
+    nvec_pair_to_save : int
+        Number of vector pairs to save.
+    e_min : double, optional
+        Minimum desired excitation energy, by default 0.0
+
+    Returns
+    -------
+    int
+        Number of new trial vectors returned.
+    """
+    # Full BSE case.
+    if Mm is not None:
+        full_dim = tri_vec.shape[1]
+        assert tri_vec.shape[0] >= nvec_pair_to_save, (
+            'Requested number of saved trial vectors is larger than the allocated space.'
+        )
+        Mp_sym = (Mp + Mp.T) / 2.0
+        Mm_sym = (Mm + Mm.T) / 2.0
+        nprod = Mm.shape[0]
+        exci_sqr, xpy_w = scipy.linalg.eigh(Mp_sym, Mm_sym, type=3)
+        e_tri = np.sqrt(exci_sqr)
+        emin_index = np.searchsorted(e_tri, e_min, side='left')
+
+        if full_dim < 2 * nvec_pair_to_save:
+            #print('full_dim < 2*nvec_pair_to_save')
+            Q, _, _ = sla.qr(tri_vec[:nprod].T, mode='economic', pivoting=True)
+            tri_vec[:nprod] = Q.T
+            return nprod
+
+        # Truncate the eigenvectors and eigenvalues outside the target energy range.
+        e_tri = e_tri[emin_index:]
+        xpy_w = xpy_w[:, emin_index:]
+        nvec_pair_to_save = min(nvec_pair_to_save, e_tri.size)
+
+        # Calculate normalized |X+Y> and |X-Y> in subspace.
+        xpy = xpy_w / np.sqrt(e_tri)[None, :]
+        xmy = (Mp_sym @ xpy) / e_tri[None, :]
+
+        # Write out the left and right vectors in the full space to a temporary file.
+        # They're written to disk because they may be too large to fit in memory.
+        with lib.H5TmpFile() as chkf:
+            dset = chkf.create_dataset('tri_vec', shape=(2 * nvec_pair_to_save, full_dim), fillvalue=0)
+            blksize = 10
+            buf = np.empty((blksize, full_dim))
+            for i in range(0, nvec_pair_to_save, blksize):
+                if i + blksize < nvec_pair_to_save:
+                    np.matmul(xpy[:, i : i + blksize].T, tri_vec[:nprod], out=buf)
+                    dset.write_direct(buf, dest_sel=np.s_[2 * i : 2 * i + blksize])
+                    np.matmul(xmy[:, i : i + blksize].T, tri_vec[:nprod], out=buf)
+                    dset.write_direct(buf, dest_sel=np.s_[2 * i + blksize : 2 * i + 2 * blksize])
+                else:
+                    remaining = nvec_pair_to_save - i
+                    np.matmul(xpy[:, i : i + remaining].T, tri_vec[:nprod], out=buf[:remaining])
+                    dset.write_direct(buf[:remaining], dest_sel=np.s_[2 * i : 2 * i + remaining])
+                    np.matmul(xmy[:, i : i + remaining].T, tri_vec[:nprod], out=buf[:remaining])
+                    dset.write_direct(buf[:remaining], dest_sel=np.s_[2 * i + remaining : 2 * i + 2 * remaining])
+
+            # Read the vectors back in and orthogonalize them.
+            assert (tri_vec[: 2 * nvec_pair_to_save].T).flags.f_contiguous
+            dset.read_direct(
+                tri_vec, source_sel=np.s_[: 2 * nvec_pair_to_save], dest_sel=np.s_[: 2 * nvec_pair_to_save]
+            )
+
+            # In-place QR decomposition leaving orthogonal vectors Q as the rows
+            # of tri_vec.
+            lwork = sla.lapack.dgeqrf_lwork(2 * nvec_pair_to_save, full_dim)
+            _, tau, _, _ = sla.lapack.dgeqrf(tri_vec[: 2 * nvec_pair_to_save].T, lwork=lwork, overwrite_a=1)
+            sla.lapack.dorgqr(tri_vec[: 2 * nvec_pair_to_save].T, tau, overwrite_a=1)
+        return 2 * nvec_pair_to_save
+
+    # TDA case.
+    else:
+        full_dim = tri_vec.shape[1]
+        assert (
+            tri_vec.shape[0] >= nvec_pair_to_save
+        ), 'Requested number of saved trial vectors is larger than the allocated space.'
+        Mp_sym = (Mp + Mp.T) / 2.0
+        nprod = Mp.shape[0]
+        e_tri, x = scipy.linalg.eigh(Mp_sym)
+        emin_index = np.searchsorted(e_tri, e_min, side='left')
+
+        if full_dim < nvec_pair_to_save:
+            #print('full_dim < 2*nvec_pair_to_save')
+            Q, R, _ = sla.qr(tri_vec[:nprod].T, mode='economic', pivoting=True)
+            tri_vec[:nprod] = Q.T
+            return nprod
+
+        # Truncate the eigenvectors and eigenvalues outside the target energy range.
+        e_tri = e_tri[emin_index:]
+        x = x[:, emin_index:]
+        nvec_pair_to_save = min(nvec_pair_to_save, e_tri.size)
+
+        # Write out the left and right vectors in the full space to a temporary file.
+        # They're written to disk because they may be too large to fit in memory.
+        with lib.H5TmpFile() as chkf:
+            dset = chkf.create_dataset('tri_vec', shape=(nvec_pair_to_save, full_dim), fillvalue=0)
+            blksize = 10
+            buf = np.empty((blksize, full_dim))
+            for i in range(0, nvec_pair_to_save, blksize):
+                if i + blksize < nvec_pair_to_save:
+                    np.matmul(x[:, i : i + blksize].T, tri_vec[:nprod], out=buf)
+                    dset.write_direct(buf, dest_sel=np.s_[i : i + blksize])
+                else:
+                    remaining = nvec_pair_to_save - i
+                    np.matmul(x[:, i : i + remaining].T, tri_vec[:nprod], out=buf[:remaining])
+                    dset.write_direct(buf[:remaining], dest_sel=np.s_[i : i + remaining])
+
+            # Read the vectors back in and orthogonalize them.
+            assert (tri_vec[:nvec_pair_to_save].T).flags.f_contiguous
+            dset.read_direct(tri_vec, source_sel=np.s_[:nvec_pair_to_save], dest_sel=np.s_[:nvec_pair_to_save])
+
+            # In-place QR decomposition leaving orthogonal vectors Q as the rows
+            # of tri_vec.
+            lwork = sla.lapack.dgeqrf_lwork(nvec_pair_to_save, full_dim)
+            _, tau, _, _ = sla.lapack.dgeqrf(tri_vec[:nvec_pair_to_save].T, lwork=lwork, overwrite_a=1)
+            sla.lapack.dorgqr(tri_vec[:nvec_pair_to_save].T, tau, overwrite_a=1)
+        return nvec_pair_to_save
+
+
+def davidson_save_checkpoint(chkfile, tri_vec, amb_prod, apb_prod, nprod):
+    """Save the current state of the Davidson algorithm to a checkpoint file.
+
+    Parameters
+    ----------
+    chkfile : str, pathlib.Path
+        Path to the checkpoint file.
+    tri_vec : ndarray
+        Trial vectors.
+    amb_prod : ndarray
+        The vectors (A-B)|tri_vec>.
+    apb_prod : ndarray
+        The vectors (A+B)|tri_vec>.
+    nprod : int
+        The number of vectors to be written---we save the slice tri_vec[:nprod].
+    """
+    full_dim = tri_vec.shape[1]
+    with h5py.File(chkfile, 'a') as chkf:
+        if 'tri_vec' not in chkf:
+            chkf.create_dataset('tri_vec', data=tri_vec, maxshape=(None, full_dim), chunks=(1, full_dim))
+            chkf.create_dataset('amb_prod', data=amb_prod, maxshape=(None, full_dim), chunks=(1, full_dim))
+            chkf.create_dataset('apb_prod', data=apb_prod, maxshape=(None, full_dim), chunks=(1, full_dim))
+        else:
+            old_ntri = chkf['tri_vec'].shape[0]
+            # Discard old contents if we're overwriting.
+            if nprod < old_ntri:
+                old_ntri = 0
+            writesel = np.s_[old_ntri:nprod]
+            chkf['tri_vec'].resize((nprod, full_dim))
+            chkf['amb_prod'].resize((nprod, full_dim))
+            chkf['apb_prod'].resize((nprod, full_dim))
+            chkf['tri_vec'].write_direct(tri_vec, source_sel=writesel, dest_sel=writesel)
+            chkf['amb_prod'].write_direct(amb_prod, source_sel=writesel, dest_sel=writesel)
+            chkf['apb_prod'].write_direct(apb_prod, source_sel=writesel, dest_sel=writesel)
+    return
+
+
+def davidson_load_from_checkpoint(chkfile, tri_vec, amb_prod, apb_prod, nload=None):
+    """Load the contents of a checkpoint file into the Davidson algorithm.
+
+    Parameters
+    ----------
+    chkfile : str, pathlib.Path
+        Path to the checkpoint file.
+    tri_vec : ndarray
+        Array to contain trial vectors.
+    amb_prod : ndarray
+        Array to contain the vectors (A-B)|tri_vec>.
+    apb_prod : ndarray
+        Array to contain the vectors (A+B)|tri_vec>.
+    nload : int, optional
+        Maximum number of trial vectors to load. If None, load all vectors.
+
+    Returns
+    -------
+    int
+        The number of trial vectors loaded.
+    """
+    with h5py.File(chkfile, 'r') as chkf:
+        if 'tri_vec' not in chkf:
+            raise ValueError('Checkpoint file does not contain tri_vec.')
+        ntri = chkf['tri_vec'].shape[0]
+        if nload is not None:
+            ntri = min(ntri, nload)
+        sel = np.s_[:ntri]
+        for array in (tri_vec, amb_prod, apb_prod):
+            if array.shape[0] < ntri:
+                raise ValueError(f'max_vec is too small to load {ntri} vectors as requested.')
+        chkf['tri_vec'].read_direct(tri_vec, source_sel=sel, dest_sel=sel)
+        chkf['amb_prod'].read_direct(amb_prod, source_sel=sel, dest_sel=sel)
+        chkf['apb_prod'].read_direct(apb_prod, source_sel=sel, dest_sel=sel)
+    return ntri
+
+
+def bse_davidson(
+    bse,
+    multi,
+    e_min=0.0,
+    delta=0.0,
+    core_orbs=None,
+    init_from_chkfile=None,
+    expand_only_core=False,
+    precond_exact_diag=False,
+):
+    """Davidson algorithm for BSE.
+    The Davidson algorithm follows doi.org/10.1063/1.477483.
+    BSE equation is defined as equation 1 in doi.org/10.1002/jcc.24688.
+    Spin-adapted formalism can be found in chapter 18.3.2 in "Concepts and methods in modern theoretical chemistry.
+    Electronic structure (2013, CRC) Ghosh S.K., Chattaraj P.K. (eds.)"
+
+    Parameters
+    ----------
+    bse : BSE
+        BSE object.
+    multi : str
+        multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
+    e_min : float, optional
+        minimum desired excitation energy. Defaults to 0.0.
+    delta : float, optional
+        energy shift for trial vector generation, typically <=0.0. Defaults to 0.0.
+    core_orbs : optional
+        filter function or AO labels or AO index, for generating trial vectors from core orbitals.
+        If this is provided, then e_min and delta are not used to generate trial vectors.
+    init_from_chkfile : str, optional
+        checkpoint file to initialize the Davidson algorithm. Defaults to None.
+    expand_only_core : bool, optional
+        expand only the core orbitals. Defaults to False.
+    precond_exact_diag : bool, optional
+        use exact diagonal preconditioning. Defaults to False.
+
+    Returns
+    -------
+    exci : double array
+        excitation energy.
+    X_vec : list of double ndarray
+        X block of eigenvector (excitation).
+    Y_vec : list of double ndarray
+        Y block of eigenvector (de-excitation).
+    """
+    # load matrix
+    nspin = bse.nspin
+    nmo = bse.nmo
+    nocc = bse.nocc
+    mo_energy = bse.mo_energy
+    Lpq = bse.Lpq
+    # load parameter
+    TDA = bse.TDA
+    max_vec = bse.max_vec
+    nroot = bse.nroot
+    max_iter = bse.max_iter
+    max_expand = bse.max_expand
+    init_ntri = max(2, bse.init_ntri)
+    residue_thresh = bse.residue_thresh
+
+    # determine dimension
+    nvir = [(nmo - nocc[i]) for i in range(nspin)]
+    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
+    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]
+
+    # initialize trial vector
+    tri_vec = np.zeros(shape=[max_vec, full_dim], dtype=np.double)
+    ntri = min(init_ntri, full_dim)  # initial guess size should be larger than nroot
+
+    if bse.trial == 'identity':
+        ntri_found, tri_vec_found = get_davidson_trial_vector(
+            bse, ntri=ntri, nocc=nocc, mo_energy=mo_energy, e_min=e_min, delta=delta, core_orbs=core_orbs
+        )
+    elif bse.trial == 'subspace':
+        ntri_found, tri_vec_found = get_davidson_trial_vector_diag(
+            ntri, multi, nocc, mo_energy, Lpq, nocc_sub=bse.nocc_sub, nvir_sub=bse.nvir_sub, e_min=e_min, delta=delta,
+            TDA=TDA
+        )
+    else:
+        raise ValueError
+
+    if ntri_found < ntri:
+        lib.logger.info(bse, f'only {ntri_found} trial vectors are generated rather than {ntri}.')
+        ntri = ntri_found
+    if ntri_found < init_ntri:
+        raise ValueError('cannot find enough trial vectors; lower e_min or add more trial vectors')
+    tri_vec[:ntri, :] = tri_vec_found
+    del tri_vec_found
+
+    # initialize Davidson matrix
+    apb_prod = np.zeros_like(tri_vec)
+    if not TDA:
+        amb_prod = np.zeros_like(tri_vec)
+    else:
+        amb_prod = None
+
+    Lia = [np.ascontiguousarray(Lpq[s][:, : nocc[s], nocc[s] :]) for s in range(nspin)]
+    Laa = [np.ascontiguousarray(Lpq[s][:, nocc[s] :, nocc[s] :]) for s in range(nspin)]
+    Lii_bar, Lia_bar = _get_lpq_bar_by_block(
+        nocc=nocc, mo_energy=mo_energy, Lii=[Lpq[s][:, : nocc[s], : nocc[s]] for s in range(nspin)], Lia=Lia
+    )
+
+    if precond_exact_diag:
+        assert TDA
+        Laa_diag = [np.diagonal(Laa[s], axis1=1, axis2=2) for s in range(nspin)]
+        Lii_bar_diag = [np.diagonal(Lii_bar[s], axis1=0, axis2=2) for s in range(nspin)]
+        v_iaia = [
+            2 / nspin * np.vecdot(Lia[s].reshape(-1, nocc[s] * nvir[s]).T, Lia[s].reshape(-1, nocc[s] * nvir[s]).T)
+            for s in range(nspin)
+        ]
+        Wiiaa = [(Lii_bar_diag[s].T @ Laa_diag[s]).reshape(nocc[s] * nvir[s]) for s in range(nspin)]
+        Wiaia = [
+            np.vecdot(Lia_bar[s].reshape(-1, nocc[s] * nvir[s]).T, Lia[s].reshape(-1, nocc[s] * nvir[s]).T).reshape(
+                nocc[s] * nvir[s]
+            )
+            for s in range(nspin)
+        ]
+        if TDA:
+            apb_diag = [v_iaia[s] - Wiiaa[s] - Wiaia[s] for s in range(nspin)]
+            #amb_diag = apb_diag
+        else:
+            apb_diag = [2 * v_iaia[s] - Wiiaa[s] - Wiaia[s] for s in range(nspin)]
+            #amb_diag = [Wiaia[s] - Wiiaa[s] for s in range(nspin)]
+        Laa_diag = None
+        Lii_bar_diag = None
+
+    # We no longer need Lpq in this function.
+    Lpq = None
+
+    # Delete Lpq if it is not needed anymore.
+    if bse.delete_lpq:
+        bse.Lpq = None
+
+    iter = 0
+    nprod = 0  # the number of contracted vectors
+    total_contract_work = 0
+    total_linalg_work = 0
+
+    Mm = None
+    Mp = None
+
+    if init_from_chkfile is not None:
+        ntri = davidson_load_from_checkpoint(init_from_chkfile, tri_vec, amb_prod, apb_prod)
+        lib.logger.info(bse, f'Loaded {ntri} trial vectors from {init_from_chkfile}.')
+        nprod = ntri
+
+    chk_last = 0
+
+    while iter < max_iter:
+        lib.logger.info(bse, '\nBSE Davidson #%d iteration, ntri= %d , nprod= %d .', iter + 1, ntri, nprod)
+        if not TDA:
+            apb_prod[nprod:ntri, :], amb_prod[nprod:ntri, :], contract_work_this_iter = _bse_contraction(
+                multi=multi,
+                nocc=nocc,
+                mo_energy=mo_energy,
+                Lia=Lia,
+                Laa=Laa,
+                Lii_bar=Lii_bar,
+                Lia_bar=Lia_bar,
+                tri_vec=tri_vec[nprod:ntri, :],
+                TDA=False,
+            )
+        else:
+            apb_prod[nprod:ntri, :], _, contract_work_this_iter = _bse_contraction(
+                multi=multi,
+                nocc=nocc,
+                mo_energy=mo_energy,
+                Lia=Lia,
+                Laa=Laa,
+                Lii_bar=Lii_bar,
+                Lia_bar=Lia_bar,
+                tri_vec=tri_vec[nprod:ntri, :],
+                TDA=True,
+            )
+        total_contract_work += contract_work_this_iter
+        lib.logger.info(bse, f'work for iter {iter+1}: {float(contract_work_this_iter):.2E}')
+
+        Mp, Mm, mmwork = update_mp_mm(Mp, Mm, tri_vec, apb_prod, amb_prod, ntri, nprod)
+        Mp_sym = (Mp + Mp.T) / 2.0
+        if not TDA:
+            Mm_sym = (Mm + Mm.T) / 2.0
+        total_linalg_work += mmwork
+        nprod_prev, nprod = nprod, ntri
+
+        if bse.chkfile is not None:
+            if nprod - chk_last >= bse.chk_every:
+                davidson_save_checkpoint(bse.chkfile, tri_vec, amb_prod, apb_prod, nprod)
+                lib.logger.info(bse, f'Saving progress at iteration {iter+1} to {bse.chkfile}: {chk_last}->{nprod}.')
+                chk_last = nprod
+
+        nroot_current = min(nroot, ntri)
+        # equation 15 in doi/10.1063/1.477483, solved by LAPACK function dsygvd
+
+        # Save current NumPy error handling settings
+        nperrhandling = np.geterr()['invalid']
+        try:
+            if not TDA:
+                exci_sqr, xpy_w = scipy.linalg.eigh(Mp_sym.T, Mm_sym.T, type=3)
+                np.seterr(invalid='raise')
+                e_tri = np.sqrt(exci_sqr)
+            else:
+                np.seterr(invalid='raise')
+                e_tri, xpy_w = scipy.linalg.eigh(Mp_sym.T, driver='evd')
+
+        except (scipy.linalg.LinAlgError, FloatingPointError):
+            lib.logger.warn(bse, 'Restarting Davidson algorithm.')
+            # restart Davidson algorithm
+            # Throw away most recent trial vectors, since they are likely to be linearly dependent
+            if bse.restart_max_size is None:
+                nvec_pair_to_save = nprod_prev
+            else:
+                nvec_pair_to_save = min(nprod_prev, bse.restart_max_size)
+            if not TDA:
+                ntri = davidson_restart(
+                    Mp[:nprod_prev, :nprod_prev],
+                    Mm[:nprod_prev, :nprod_prev],
+                    tri_vec,
+                    nvec_pair_to_save=nvec_pair_to_save,
+                    e_min=e_min,
+                )
+            else:
+                ntri = davidson_restart(
+                    Mp[:nprod_prev, :nprod_prev], None, tri_vec, nvec_pair_to_save=nvec_pair_to_save, e_min=e_min
+                )
+
+            # Set nprod to 0 to recalculate all mat-vec products.
+            nprod = 0
+            Mp = None
+            Mm = None
+            iter += 1
+            continue
+
+        finally:
+            # Restore NumPy error handling settings
+            np.seterr(invalid=nperrhandling)
+
+        if not TDA:
+            # dsygvd normalizes xpy_w such that
+            # xpy_w @ xpy_w.T = A - B
+            # Using the fact that A - B = (X+Y) @ diag(w) @ (X+Y).T,
+            # we calculate X+Y = xpy_w @ diag(1/sqrt(w)).
+            xpy = xpy_w / np.sqrt(e_tri)[None, :]
+
+            # (A+B) |X+Y> = w |X-Y>, so
+            # |X-Y> = w^-1 (A+B) |X+Y>
+            xmy = (Mp_sym @ xpy) / e_tri[None, :]
+
+            # Thanks to the use of the generalized eigensolver,
+            # xpy and xmy already form a biorthonormal system.
+
+        else:
+            # TDA is easy
+            xpy = xpy_w
+
+        total_linalg_work += ntri**3
+
+        found_roots = np.flatnonzero(e_tri >= e_min)
+        nrootfound = min(nroot, found_roots.size)
+        lib.logger.debug(bse, 'lowest %d exci above minimum: \n%s', nrootfound, e_tri[found_roots[:nrootfound]])
+        emin_index = np.searchsorted(e_tri, e_min, side='left')
+        if emin_index + nroot_current > ntri:
+            emin_index = ntri - nroot_current
+            if ntri >= nroot:
+                lib.logger.info(bse, 'fewer than nroot exci found above e_min.')
+
+        if core_orbs is not None and nspin == 1 and expand_only_core:
+            if not hasattr(bse, 'mol'):
+                raise ValueError('mol object is required if core_orbs is given.')
+            # Select those occupied orbitals with a significant contribution from given core orbitals.
+            occ_we_want = np.flatnonzero(
+                mo_mapping.mo_comps(core_orbs, bse.mol, bse.mo_coeff[0][:, : nocc[0]]) >= 0.3
+            )
+            core_roots = []
+
+            for idx in range(emin_index, ntri):
+                if not TDA:
+                    Xvec = (0.5 * (xpy[:, idx].T + xmy[:, idx].T)) @ tri_vec[:ntri, :]
+                else:
+                    Xvec = xpy[:, idx].T @ tri_vec[:ntri, :]
+                Xvec = Xvec.reshape(nocc[0], nvir[0])
+                Xvecsqr = np.linalg.norm(Xvec, axis=1)
+                X_core_component = np.linalg.norm(Xvecsqr[occ_we_want])
+                if X_core_component > 0.3:
+                    core_roots.append(idx)
+                if len(core_roots) >= nroot_current:
+                    break
+            exci_candidate_indices = np.asarray(core_roots, dtype=int)
+            lib.logger.debug(
+                bse,
+                'lowest %d core excitations above minimum: \n%s',
+                exci_candidate_indices.size,
+                e_tri[exci_candidate_indices],
+            )
+
+        else:
+            exci_candidate_indices = np.s_[emin_index : emin_index + nroot_current]
+
+        exci = e_tri[exci_candidate_indices]
+        # get left and right eigenvector in the full space, equation 25 and 26 in doi.org/10.1063/1.477483
+
+        right_vec_tri = xpy.T[exci_candidate_indices, :]
+        right_vec = np.matmul(right_vec_tri, tri_vec[:ntri, :])
+        total_linalg_work += nroot_current * ntri * full_dim
+
+        if not TDA:
+            left_vec_tri = xmy.T[exci_candidate_indices, :]
+            left_vec = np.matmul(left_vec_tri, tri_vec[:ntri, :])
+            total_linalg_work += nroot_current * ntri * full_dim
+
+        if not TDA:
+            right_res = -exci[:, None] * left_vec
+            left_res = -exci[:, None] * right_vec
+            right_res += np.matmul(right_vec_tri, apb_prod[:ntri, :])
+            left_res += np.matmul(left_vec_tri, amb_prod[:ntri, :])
+
+            # check convergence
+            res_norms_left = np.linalg.norm(left_res, axis=1) ** 2
+            res_norms_right = np.linalg.norm(right_res, axis=1) ** 2
+            res_norms = np.maximum(res_norms_left, res_norms_right)
+
+        else:  # TDA
+            right_res = -exci[:, None] * right_vec
+            right_res += np.matmul(right_vec_tri, apb_prod[:ntri, :])
+            res_norms = np.linalg.norm(right_res, axis=1) ** 2
+
+        max_res_norm = np.max(res_norms)
+        conv_vec = res_norms < residue_thresh
+        lib.logger.info(bse, 'max residue norm = %.4e', max_res_norm)
+        if conv_vec.size >= nroot:
+            if np.all(conv_vec[:nroot]):
+                conv = True
+                break
+
+        not_converged = np.flatnonzero(~conv_vec)
+        errs_not_converged = res_norms[not_converged]
+        assert np.max(errs_not_converged) == max_res_norm
+        srt_errs = np.argsort(errs_not_converged)[::-1]
+        nexpand = min(max_expand, nroot_current, not_converged.size, full_dim - ntri)
+        candidates_to_expand = not_converged[srt_errs[:nexpand]]
+
+        # Gather both left and right residues
+        if not TDA:
+            all_res = np.empty(shape=(2 * nexpand, full_dim), dtype=np.double)
+        else:
+            all_res = np.empty(shape=(nexpand, full_dim), dtype=np.double)
+
+        # preconditioning the residues, equation 29 in doi.org/10.1063/1.477483.
+        for s in range(nspin):
+            q_vec = exci[candidates_to_expand, None, None] - (
+                mo_energy[s][None, None, nocc[s] :] - mo_energy[s][None, : nocc[s], None]
+            )
+            q_vec = q_vec.reshape(-1, nocc[s] * nvir[s])
+            if precond_exact_diag:
+                q_vec -= apb_diag[s].reshape(-1, nocc[s] * nvir[s])
+            all_res[:nexpand, s * dim[0] : s * dim[0] + dim[s]] = (
+                right_res[candidates_to_expand, s * dim[0] : s * dim[0] + dim[s]] / q_vec
+            )
+            if not TDA:
+                all_res[nexpand:, s * dim[0] : s * dim[0] + dim[s]] = (
+                    left_res[candidates_to_expand, s * dim[0] : s * dim[0] + dim[s]] / q_vec
+                )
+
+        # The rows of all_res are now the preconditioned left residues
+        # followed by the preconditioned right residues.
+
+        # Orthogonalize residues against current trial vectors
+        all_res -= (all_res @ tri_vec[:ntri, :].T) @ tri_vec[:ntri, :]
+        # Orthogonalize residues amongst themselves
+        Q, R, _ = scipy.linalg.qr(all_res.T, mode='economic', pivoting=True)
+
+        # Don't care about the small residues
+        orth_res = Q.T[np.abs(np.diag(R)) > 1e-10]
+        # But we should take at least one new vector.
+        if orth_res.size == 0:
+            orth_res = Q.T[:1]
+
+        # Make sure the residues are orthogonal to the trial vectors
+        # and normalize them.
+        orth_res -= (orth_res @ tri_vec[:ntri, :].T) @ tri_vec[:ntri, :]
+        orth_res /= np.linalg.norm(orth_res, axis=1)[:, None]
+
+        n_new_vec = min(orth_res.shape[0], full_dim - ntri)
+        if n_new_vec > 0:
+            if ntri + n_new_vec > tri_vec.shape[0]:
+                raise ValueError('Exceeded max_vec. Davidson algorithm for BSE is not converged!')
+            tri_vec[ntri : ntri + n_new_vec] = orth_res[:n_new_vec]
+            ntri += n_new_vec
+            lib.logger.info(bse, 'add %d new trial vectors.', n_new_vec)
+        else:
+            # We need to restart.
+            lib.logger.warn(bse, 'Restarting Davidson algorithm.')
+            if bse.restart_max_size is None:
+                nvec_pair_to_save = ntri
+            else:
+                nvec_pair_to_save = min(ntri, bse.restart_max_size)
+            ntri = davidson_restart(Mp, Mm, tri_vec, nvec_pair_to_save=nvec_pair_to_save, e_min=e_min)
+        conv = False
+
+        iter += 1
+        if conv is True:
+            break
+
+    assert conv is True, 'Davidson algorithm for BSE is not converged!'
+
+    if bse.chkfile is not None:
+        davidson_save_checkpoint(bse.chkfile, tri_vec, amb_prod, apb_prod, nprod)
+        lib.logger.info(bse, f'Saving progress at iteration {iter+1} to {bse.chkfile}: {chk_last}->{nprod}.')
+        chk_last = nprod
+
+    lib.logger.info(bse, f'BSE converged in {iter} iterations, final subspace size = {nprod}')
+    lib.logger.info(bse, f'total work for contraction: {float(total_contract_work):.2E}')
+    lib.logger.info(bse, f'total work for linalg: {float(total_linalg_work):.2E}')
+    lib.logger.info(bse, f'Mp condition number: {np.linalg.cond(Mp_sym)}')
+    if Mm is not None:
+        lib.logger.info(bse, f'Mm condition number: {np.linalg.cond(Mm_sym)}')
+
+    found_roots = np.flatnonzero((exci >= e_min) & conv_vec)
+    nrootfound = found_roots.size
+    lib.logger.debug(bse, 'Finished with %d converged roots: \n%s', nrootfound, exci[found_roots])
+
+    # transfer left and right eigenvector to X and Y
+
+    if not TDA:
+        X_vec = (left_vec[found_roots] + right_vec[found_roots]) * 0.5
+        Y_vec = (-left_vec[found_roots] + right_vec[found_roots]) * 0.5
+    else:
+        X_vec = right_vec[found_roots]
+        Y_vec = np.zeros_like(X_vec)
+
+    # reshape X and Y eigenvector
+    if nspin == 1:
+        X_vec = [X_vec.reshape(nrootfound, nocc[0], nvir[0])]
+        Y_vec = [Y_vec.reshape(nrootfound, nocc[0], nvir[0])]
+    else:
+        X_vec_a, X_vec_b, Y_vec_a, Y_vec_b = [], [], [], []
+        for r in range(nrootfound):
+            X_vec_a.append(X_vec[r][: dim[0]].reshape(nocc[0], nvir[0]))
+            X_vec_b.append(X_vec[r][dim[0] :].reshape(nocc[1], nvir[1]))
+            Y_vec_a.append(Y_vec[r][: dim[0]].reshape(nocc[0], nvir[0]))
+            Y_vec_b.append(Y_vec[r][dim[0] :].reshape(nocc[1], nvir[1]))
+        X_vec = [np.asarray(X_vec_a), np.asarray(X_vec_b)]
+        Y_vec = [np.asarray(Y_vec_a), np.asarray(Y_vec_b)]
+
+    bse.exci = exci[found_roots]
+    bse.X_vec = X_vec
+    bse.Y_vec = Y_vec
+
+    return exci[found_roots], X_vec, Y_vec
+
+
+def update_mp_mm(Mp, Mm, tri_vec, apb_prod, amb_prod, ntri, nprod):
+    """Update Mp and Mm to reflect the new trial vectors.
+
+    Parameters
+    ----------
+    Mp : ndarray
+        The matrix <tri_vec|A+B|tri_vec>
+    Mm : ndarray or None
+        The matrix <tri_vec|A-B|tri_vec>
+    tri_vec : ndarray
+        Trial vectors (stored as rows).
+    apb_prod : ndarray
+        The vectors (A+B)|tri_vec> (stored as rows).
+    amb_prod : ndarray or None
+        The vectors (A-B)|tri_vec> (stored as rows).
+    ntri : int
+        Number of valid trial vectors in tri_vec.
+    nprod : int
+        Number of valid trial vectors when Mm and Mp were last updated.
+
+    Returns
+    -------
+    (ndarray, ndarray, int)
+        Mm, Mp, work; where work is a rough estimate of the FLOP count.
+    """
+    full_dim = tri_vec.shape[1]
+    work = 0
+    if Mp is None or Mm is None:
+        # A+B and A-B in subspace, step 3 in doi.org/10.1063/1.477483
+        if apb_prod is not None:
+            Mp = np.matmul(tri_vec[:ntri, :], apb_prod[:ntri, :].T)
+            work += ntri**2 * full_dim
+
+        if amb_prod is not None:
+            Mm = np.matmul(tri_vec[:ntri, :], amb_prod[:ntri, :].T)
+            work += ntri**2 * full_dim
+
+    else:
+        if apb_prod is not None:
+            Mp_new = np.zeros(shape=[ntri, ntri], dtype=np.double)
+            Mp_new[:nprod, :nprod] = Mp[:nprod, :nprod]
+            Mp_new[nprod:ntri, :ntri] = tri_vec[nprod:ntri, :] @ apb_prod[:ntri, :].T
+            Mp_new[:ntri, nprod:ntri] = Mp_new[nprod:ntri, :ntri].T
+            Mp_new[nprod:ntri, nprod:ntri] = tri_vec[nprod:ntri, :] @ apb_prod[nprod:ntri, :].T
+            Mp = Mp_new
+            work += (ntri**2 - nprod**2) * full_dim
+
+        if amb_prod is not None:
+            Mm_new = np.zeros(shape=[ntri, ntri], dtype=np.double)
+            Mm_new[:nprod, :nprod] = Mm[:nprod, :nprod]
+            Mm_new[nprod:ntri, :ntri] = tri_vec[nprod:ntri, :] @ amb_prod[:ntri, :].T
+            Mm_new[:ntri, nprod:ntri] = Mm_new[nprod:ntri, :ntri].T
+            Mm_new[nprod:ntri, nprod:ntri] = tri_vec[nprod:ntri, :] @ amb_prod[nprod:ntri, :].T
+            Mm = Mm_new
+            work += (ntri**2 - nprod**2) * full_dim
+
+    return Mp, Mm, work
+
+
+def bse_lanczos(bse, multi, u1=None, core_orbs=None, nsteps=100):
+    """Lanczos algorithm for BSE.
+    Follows 10.1137/16M1102641.
+
+    Parameters
+    ----------
+    bse : BSE
+        BSE object.
+    multi : str
+        multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
+    u1 : np.ndarray, optional
+        initial state for Lanczos algorithm, by default None
+    core_orbs : np.ndarray, optional
+        core orbitals, by default None
+    nsteps : int, optional
+        the number of Lanczos steps, by default 100
+
+    Returns
+    -------
+    alphas : double array
+        coefficients from the Lanczos algorithm, diagonal elements of the tridiagonal matrix.
+    betas : double array
+        coefficients from the Lanczos algorithm, off-diagonal elements of the tridiagonal matrix.
+    """
+    # load matrix
+    nspin = bse.nspin
+    nmo = bse.nmo
+    nocc = bse.nocc
+    mo_energy = bse.mo_energy
+    # load parameter
+    TDA = bse.TDA
+
+    # determine dimension
+    nvir = [(nmo - nocc[i]) for i in range(nspin)]
+    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
+    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]
+
+    Lia = [np.ascontiguousarray(bse.Lpq[s][:, : nocc[s], nocc[s] :]) for s in range(nspin)]
+    Laa = [np.ascontiguousarray(bse.Lpq[s][:, nocc[s] :, nocc[s] :]) for s in range(nspin)]
+    Lii_bar, Lia_bar = _get_lpq_bar_by_block(
+        nocc=nocc, mo_energy=mo_energy, Lii=[bse.Lpq[s][:, : nocc[s], : nocc[s]] for s in range(nspin)], Lia=Lia
+    )
+
+    prev_vecs = np.zeros((nsteps + 1, full_dim))
+
+    if core_orbs is not None:
+        assert u1 is None, 'u1 and core_orbs cannot be used together'
+        u1 = np.zeros(full_dim)
+        occ_to_take = [
+            np.flatnonzero(mo_mapping.mo_comps(core_orbs, bse.mol, bse.mo_coeff[s]) >= 0.5) for s in range(nspin)
+        ]
+        for s in range(nspin):
+            vir_to_take = np.arange(nocc[s], nmo, dtype=int)
+            for o in occ_to_take[s]:
+                u1[s * dim[s] + o * nvir[s] + vir_to_take] = 1.0
+        u1 = u1 / np.linalg.norm(u1)
+
+    elif u1 is None:
+        eia = []
+        for s in range(nspin):
+            eia.append(np.asarray(mo_energy[s][None, nocc[s] :] - mo_energy[s][: nocc[s], None]).reshape(-1))
+        eia = np.concatenate(eia, axis=0)
+        u1 = np.random.random(full_dim) - 0.5
+        u1 = u1 / np.linalg.norm(u1)
+
+    apb_u1, _, _ = _bse_contraction(
+        multi=multi,
+        nocc=nocc,
+        mo_energy=mo_energy,
+        Lia=Lia,
+        Laa=Laa,
+        Lii_bar=Lii_bar,
+        Lia_bar=Lia_bar,
+        tri_vec=u1[None, :],
+        TDA=TDA,
+    )
+
+    apb_u1 = apb_u1.reshape(-1)
+
+    betas = np.zeros(nsteps)
+    alphas = np.zeros(nsteps)
+
+    if TDA is False:
+        u1_apbnorm = np.dot(u1, apb_u1)
+        u = u1 / np.sqrt(u1_apbnorm)
+        v = apb_u1 / np.sqrt(u1_apbnorm)
+    else:
+        u = u1 / np.linalg.norm(u1)
+        v = u
+
+    u_last = np.zeros_like(u)
+    #v_last = np.zeros_like(v)
+    beta_last = 0.0
+
+    prev_vecs[0] = v
+    nprev = 1
+
+    for step in range(nsteps):
+        lib.logger.debug(bse, 'BSE Lanczos #%d iteration', step + 1)
+        if TDA is False:
+            # x = (A - B) v_j - beta_{j-1} u_{j-1}
+            _, amb_v, _ = _bse_contraction(
+                multi=multi,
+                nocc=nocc,
+                mo_energy=mo_energy,
+                Lia=Lia,
+                Laa=Laa,
+                Lii_bar=Lii_bar,
+                Lia_bar=Lia_bar,
+                tri_vec=v.reshape((1, -1)),
+                TDA=TDA,
+            )
+            amb_v = amb_v.reshape(-1)
+            sla.blas.daxpy(u_last, amb_v, a=-beta_last)
+            x = amb_v
+            # alpha = v_j^T x
+            alphas[step] = np.dot(x, v)
+            # x = x - alpha u_j
+            sla.blas.daxpy(u, x, a=-alphas[step])
+            # y = (A + B) x
+            y, _, _ = _bse_contraction(
+                multi=multi,
+                nocc=nocc,
+                mo_energy=mo_energy,
+                Lia=Lia,
+                Laa=Laa,
+                Lii_bar=Lii_bar,
+                Lia_bar=Lia_bar,
+                tri_vec=x.reshape((1, -1)),
+                TDA=TDA,
+            )
+            y = y.reshape(-1)
+            # beta_j = sqrt(x^T y)
+            betas[step] = np.sqrt(np.dot(x, y))
+            u_last = u
+            # v_last = v
+            # u_{j+1} = x / beta_j
+            # v_{j+1} = y / beta_j
+            sla.blas.dscal(1.0 / betas[step], x)
+            sla.blas.dscal(1.0 / betas[step], y)
+            u = x
+            v = y
+        else:
+            # TDA approximation
+            # v = A u_j - beta_{j-1} u_{j-1}
+            v, _, _ = _bse_contraction(
+                multi=multi,
+                nocc=nocc,
+                mo_energy=mo_energy,
+                Lia=Lia,
+                Laa=Laa,
+                Lii_bar=Lii_bar,
+                Lia_bar=Lia_bar,
+                tri_vec=u.reshape((1, -1)),
+                TDA=TDA,
+            )
+            v = v.reshape(-1)
+            sla.blas.daxpy(u_last, v, a=-beta_last)
+            # alpha_j = u_j^T v
+            alphas[step] = np.dot(u, v)
+            # v = v - alpha u_j
+            sla.blas.daxpy(u, v, a=-alphas[step])
+
+            # orthogonalize against previous vectors
+            hs = prev_vecs[:nprev] @ v
+            v -= prev_vecs[:nprev].T @ hs
+
+            # beta_j = ||v||
+            betas[step] = np.linalg.norm(v)
+            # u_{j+1} = v / beta_j
+            sla.blas.dscal(1.0 / betas[step], v)
+            u_last = u
+            u = v
+            prev_vecs[nprev] = v
+            nprev += 1
+        beta_last = betas[step]
+    return alphas, betas
+
+
+def lanczos_roots_magnitudes(alphas, betas, TDA=False):
+    """Estimate the excitation spectrum density from the results of the Lanczos algorithm.
+
+    Parameters
+    ----------
+    alphas : double array
+        coefficients from the Lanczos algorithm, diagonal elements of the tridiagonal matrix.
+    betas : double array
+        coefficients from the Lanczos algorithm, off-diagonal elements of the tridiagonal matrix.
+    TDA : bool, optional
+        used TDA approximation, by default False
+
+    Returns
+    -------
+    roots_pos : double array
+        positive roots of excitation energies.
+    magnitudes : double array
+        the magnitude of each root.
+    """
+    Tk_diag = np.concatenate([alphas, alphas[-2::-1]], axis=0)
+    Tk_offdiag = np.concatenate([betas, betas[-3::-1]], axis=0)
+    roots, S = scipy.linalg.eigh_tridiagonal(Tk_diag, Tk_offdiag, lapack_driver='stebz')
+    roots_pos = roots[roots > 0]
+    if TDA is False:
+        roots_pos = np.sqrt(roots_pos)
+    magnitudes = S[0, roots > 0] ** 2
+
+    if TDA:
+        return roots_pos, magnitudes
+    else:
+        return roots_pos, magnitudes / roots_pos
+
+
+def lanczos_estimate_spectrum(alphas, betas, e_range, eta, nw, TDA=False):
+    """Estimate the excitation spectrum density from the results of the Lanczos algorithm.
+
+    Parameters
+    ----------
+    alphas : double array
+        coefficients from the Lanczos algorithm, diagonal elements of the tridiagonal matrix.
+    betas : double array
+        coefficients from the Lanczos algorithm, off-diagonal elements of the tridiagonal matrix.
+    e_range : tuple
+        energy range (e_min, e_max).
+    eta : float
+        broadening parameter.
+    nw : int
+        number of frequency points.
+    TDA : bool, optional
+        used TDA approximation, by default False
+
+    Returns
+    -------
+    freqs : double array
+        frequency points at which to compute density estimate.
+    density : double array
+        excitation spectrum density estimate.
+    """
+    roots_pos, magnitudes = lanczos_roots_magnitudes(alphas, betas, TDA=TDA)
+
+    freqs = np.linspace(e_range[0], e_range[1], nw)
+
+    def gauss_broad(omega, eta, roots):
+        normalization = 1.0 / np.sqrt(2 * np.pi * eta**2)
+        return normalization * (
+            np.exp(-((omega[:, None] - roots[None, :]) ** 2) / (2 * eta**2))
+            - np.exp(-((omega[:, None] + roots[None, :]) ** 2) / (2 * eta**2))
+        )
+
+    density = gauss_broad(freqs, eta, roots_pos) @ magnitudes
+    return freqs, density
+
+
+def get_davidson_trial_vector(bse, ntri, nocc, mo_energy, e_min=0.0, delta=0.0, core_orbs=None):
+    """Generate initial trial vectors for particle-hole excitations.
+    The order is determined by the occ-vir pair orbital energy difference.
+    The initial trial vectors are diagonal. They are generated by taking
+    occ-vir pairs with an energy difference of >= e_min + delta.
+
+    Parameters
+    ----------
+    bse : BSE
+        BSE object
+    ntri : int
+        number of desired initial trial vectors.
+    nocc : int array
+        number of occupied orbitals.
+    mo_energy : double ndarray
+        orbital energy.
+    e_min : float, optional
+        minimum desired excitation energy, by default 0.0
+    delta : float, optional
+        energy shift for trial vector generation, typically <=0.0, by default 0.0
+    core_orbs : optional
+        core orbitals, by default None
+
+    Returns
+    -------
+    ntri : int
+        the number of actual trial vectors generated
+    tri_vec : double ndarray
+         initial trial vectors
+    """
+    nspin, nmo = mo_energy.shape
+    nvir = [(nmo - nocc[i]) for i in range(nspin)]
+    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
+    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]
+
+    if core_orbs is not None:
+        if not hasattr(bse, 'mol'):
+            raise ValueError('mol object is required for generating trial vectors for core excitations.')
+        # Select those occupied orbitals with a significant contribution from given core orbitals.
+        occ_to_take = [
+            np.flatnonzero(mo_mapping.mo_comps(core_orbs, bse.mol, bse.mo_coeff[s]) >= 0.3) for s in range(nspin)
+        ]
+    else:
+        occ_to_take = [np.arange(nocc[s], dtype=int) for s in range(nspin)]
+
+    e_diffs = []
+    e_diffs_shp = []
+
+    for s in range(nspin):
+        # The shape of e_diffs_s is (nocc[s], nvir[s])
+        # e_diffs_s[i, a] = mo_energy[s][a] - mo_energy[s][i]
+        e_diffs_s = mo_energy[s][None, nocc[s] :] - mo_energy[s][occ_to_take[s], None]
+        e_diffs_shp.append(e_diffs_s.shape)
+        # Flatten e_diffs[s] into a 1D array.
+        e_diffs_s = e_diffs_s.reshape(-1)
+        e_diffs.append(e_diffs_s)
+
+    # At this point, the structure of e_diffs is as follows:
+    # e_diffs[spin, ia] = mo_energy[spin][a] - mo_energy[spin][i]
+    # where ia = a + nvir[spin] * i
+
+    # Glue the e_diffs together into a 1D array.
+    all_ediffs = np.concatenate(e_diffs, axis=0)
+
+    # Compute the sizes of the occ-vir blocks for each spin.
+    e_diffs_sizes = [0] + [nocc[s] * nvir[s] for s in range(nspin)]
+    # Compute the starting index of each spin's occ-vir block.
+    # This indicates where e_diffs[s] resides in all_ediffs, for each s.
+    e_diffs_starts = np.cumsum(e_diffs_sizes)
+
+    # Find the indices which sort all_ediffs.
+    sort_index = np.argsort(all_ediffs)
+
+    # Take the lowest ntri pairs with energy difference greater than e_min + delta.
+    e_min_index = np.searchsorted(all_ediffs, e_min + delta, side='left', sorter=sort_index)
+    if e_min_index + ntri > all_ediffs.size:
+        # cannot find enough pairs for trial vectors; lower e_min
+        ntri = all_ediffs.size - e_min_index
+    exci_to_take = sort_index[e_min_index : e_min_index + ntri]
+
+    # exci_to_take is an index into all_ediffs.
+    # We need to convert it back to orbital indices.
+
+    tri_vec = np.zeros(shape=[ntri, full_dim], dtype=np.double)
+
+    cur_trivec = 0
+    for s in range(nspin):
+        # Figure out which excitation indices are in this spin block.
+        exci_this_spin = np.extract(
+            (exci_to_take >= e_diffs_starts[s]) & (exci_to_take < e_diffs_starts[s + 1]), exci_to_take
+        )
+        # Subtract the starting index of this spin's occ-vir block.
+        # They are now in the form ia = i * nvir[s] + a.
+        # That is, they are indices into e_diffs[s].reshape(-1).
+        exci_this_spin -= e_diffs_starts[s]
+        # Convert the indices from 1D form (i * nvir[s] + a) to 2D form (i, a).
+        ex_occ, ex_vir = np.unravel_index(exci_this_spin, e_diffs_shp[s])
+        ex_occ = occ_to_take[s][ex_occ]
+        n_exci = exci_this_spin.size
+
+        # The following is shorthand for
+        # for i, a in zip(ex_occ, ex_vir):
+        #     tri_vec[cur_trivec, s * dim[s] + i * nvir[s] + a] = 1.
+        #     cur_trivec += 1
+        tri_vec[range(cur_trivec, cur_trivec + n_exci), s * dim[s] + ex_occ * nvir[s] + ex_vir] = 1.0
+        cur_trivec += n_exci
+
+    return ntri, tri_vec
+
+
+def get_davidson_trial_vector_diag(
+    ntri, multi, nocc, mo_energy, Lpq, nocc_sub=50, nvir_sub=150, e_min=0.0, delta=0.0, TDA=False
+):
+    """Get trial vectors from subspace diagnoalization.
+
+    Parameters
+    ----------
+    ntri : int
+        number of trial vectors
+    multi : str
+        multiplicity
+    nocc : list
+        number of occupied orbitals
+    mo_energy : ndarray
+        orbital energy
+    Lpq : ndarray
+        three-center density-fitting matrix
+    nocc_sub : int, optional
+        number of subspace occupied orbitals, by default 50
+    nvir_sub : int, optional
+        number of subspace virtual orbitals, by default 150
+    e_min : float, optional
+        minimum desired excitation energy, by default 0.0
+    delta : float, optional
+        energy shift for trial vector generation, typically <=0.0, by default 0.0
+    TDA : bool, optional
+        use Tamm-Dancoff approximation, by default False
+
+    Returns
+    -------
+    ntri : int
+        the number of actual trial vectors generated
+    tri_vec : double ndarray
+        initial trial vectors
+    """
+    nspin, nmo = mo_energy.shape
+    nvir = [(nmo - nocc[i]) for i in range(nspin)]
+    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
+
+    # adjust active space if necessary
+    nocc_sub = int(min(nocc[0], nocc_sub))
+    nvir_sub = int(min(nvir[0], nvir_sub))
+
+    if nspin == 1:
+        nocc_sub = [nocc_sub]
+        nvir_sub = [nvir_sub]
+    else:
+        # numbers of beta orbitals are determined by alpha
+        spin = nocc[0] - nocc[1]
+        nocc_sub = [nocc_sub, nocc_sub - spin]
+        nvir_sub = [nvir_sub, nvir_sub + spin]
+
+    # get active-space BSE input
+    start = [(nocc[s] - nocc_sub[s]) for s in range(nspin)]
+    end = [(nocc[s] + nvir_sub[s]) for s in range(nspin)]
+    mo_energy_sub = np.asarray([mo_energy[s, start[s] : end[s]] for s in range(nspin)])
+    Lpq_sub = np.asarray([Lpq[s, :, start[s] : end[s], start[s] : end[s]] for s in range(nspin)])
+
+    exci, X_vec, Y_vec = bse_full_diagonalization(
+        multi=multi, nocc=nocc_sub, mo_energy=mo_energy_sub, Lpq=Lpq_sub, TDA=TDA
+    )
+
+    for i in range(len(exci)):
+        if exci[i] > (e_min + delta):
+            first_state = i
+            break
+
+    ntri = min(ntri, len(exci) - first_state)
+    tri_vec = []
+    for s in range(nspin):
+        tri_vec.append(np.zeros(shape=[ntri, nocc[s], nvir[s]], dtype=np.double))
+        X_vec_tri = X_vec[s][first_state : first_state + ntri].reshape(ntri, nocc_sub[s], nvir_sub[s])
+        tri_vec[s][:, nocc[s] - nocc_sub[s] :, :nvir_sub[s]] = X_vec_tri
+        tri_vec[s] = tri_vec[s].reshape(ntri, dim[s])
+    tri_vec = np.concatenate(tri_vec, axis=1)
+
+    return ntri, tri_vec
+
+
+def _bse_contraction(multi, nocc, mo_energy, Lia, Laa, Lii_bar, Lia_bar, tri_vec, TDA=False):
+    """Contraction for BSE matrix and trial vectors.
+    W part is as equation 25 and 26 in doi.org/10.1002/jcc.24688.
+
+    Parameters
+    ----------
+    multi : str
+        multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
+    nocc : int array
+        the number of occupied orbitals.
+    mo_energy : double ndarray
+        orbital energy.
+    Lia : double ndarray
+        3-center density-fitting matrix, ov block.
+    Laa : double ndarray
+        3-center density-fitting matrix, vv block.
+    Lii_bar : double ndarray
+        auxiliary 3-center matrix as equation 21 in doi.org/10.1002/jcc.24688.
+    Lia_bar : double ndarray
+        auxiliary 3-center matrix as equation 21 in doi.org/10.1002/jcc.24688.
+    tri_vec : double ndarray
+        trial vector.
+    TDA : bool, optional
+        use TDA approximation, by default False
+
+    Returns
+    -------
+    apb_prod : double ndarray
+        A+B matrix and trial vector contracted vectors.
+    amb_prod : double ndarray
+        A-B matrix and trial vector contracted vectors.
+    """
+    nspin = len(Lia)
+    naux, _, _ = Lia[0].shape
+    nmo = Lia[0].shape[2] + Lia[0].shape[1]
+    ntri = tri_vec.shape[0]
+
+    nvir = [(nmo - nocc[i]) for i in range(nspin)]
+    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
+    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]
+
+    work_done = 0
+
+    scale = 4.0 / nspin
+    if TDA is True:
+        scale /= 2.0
+
+    apb_prod = np.zeros(shape=[ntri, full_dim], dtype=np.double)
+    if TDA:
+        amb_prod = None
+    else:
+        amb_prod = np.zeros(shape=[ntri, full_dim], dtype=np.double)
+
+    # contraction: V
+    if multi != 't' and multi != 'T':
+        Lpq_z = np.empty(shape=[nspin, naux], dtype=np.double)
+        for ivec in range(ntri):
+            for s in range(nspin):
+                z = tri_vec[ivec][s * dim[0] : s * dim[0] + dim[s]].reshape(nocc[s], nvir[s])
+                # The following code is exactly equivalent to
+                # Lpq_z[s] = einsum('Pjb,jb->P', Lia[s], z)
+                scipy.linalg.blas.dgemv(
+                    alpha=1.0,
+                    a=Lia[s].reshape(naux, -1).T,
+                    x=z.reshape(-1),
+                    y=Lpq_z[s],
+                    overwrite_y=True,
+                    trans=1,
+                )
+                work_done += naux * nvir[s] * nocc[s]
+
+            for s in range(nspin):
+                for t in range(nspin):
+                    # vz = einsum('Pia,P->ia', Lia[s], Lpq_z[t]).reshape(-1) * scale
+                    # apb_prod[ivec][s * dim[0] : s * dim[0] + dim[s]] += vz
+                    scipy.linalg.blas.dgemv(
+                        alpha=scale,
+                        a=Lia[s].reshape(naux, -1).T,
+                        x=Lpq_z[t],
+                        beta=1.0,
+                        y=apb_prod[ivec][s * dim[0] : s * dim[0] + dim[s]],
+                        overwrite_y=True,
+                        trans=0,
+                    )
+                    work_done += naux * nvir[s] * nocc[s]
+                    # No need to compute this for TDA
+                    # if TDA is True and return_amb:
+                    #     amb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += vz
+
+    # contraction: W
+    for s in range(nspin):
+        jLa_zs = np.zeros(shape=[nocc[s], naux * nvir[s]], dtype=np.double)
+        waz = np.zeros((nocc[s], nvir[s]), dtype=np.double)
+        if not TDA:
+            jLi_zs = np.zeros(shape=[nocc[s], naux * nocc[s]], dtype=np.double)
+            wbz = np.zeros((nocc[s], nvir[s]), dtype=np.double)
+        for ivec in range(ntri):
+            z = tri_vec[ivec][s * dim[0] : s * dim[0] + dim[s]].reshape(nocc[s], nvir[s])
+            # The following calculation for waz is equivalent to
+            # jLa_zs = einsum('Lab,jb->jLa', Laa[s], z)
+            # waz = -einsum('jLi,jLa->ia', Lii_bar[s], jLa_zs).reshape(-1)
+            np.matmul(z, Laa[s].reshape(-1, nvir[s]).T, out=jLa_zs)
+            scipy.linalg.blas.dgemm(
+                alpha=-1.0,
+                a=jLa_zs.reshape(nocc[s] * naux, nvir[s]).T,
+                b=Lii_bar[s].reshape(nocc[s] * naux, nocc[s]).T,
+                trans_a=0,
+                trans_b=1,
+                c=waz.T,
+                overwrite_c=True,
+            )
+            work_done += naux * nocc[s] * nocc[s] * nvir[s]
+
+            if not TDA:
+                # the following calculation for wbz is equivalent to
+                # jLi_zs = einsum('Lib,jb->Lij', Lia[s], z)
+                # wbz = -einsum('Lja,jLi->ia', Lia_bar[s], jLi_zs).reshape(-1)
+                np.matmul(z, Lia[s].reshape(-1, nvir[s]).T, out=jLi_zs)
+                scipy.linalg.blas.dgemm(
+                    alpha=-1.0,
+                    a=Lia_bar[s].reshape(nocc[s] * naux, nvir[s]).T,
+                    b=jLi_zs.reshape(nocc[s] * naux, nocc[s]).T,
+                    trans_a=0,
+                    trans_b=1,
+                    beta=0.0,
+                    c=wbz.T,
+                    overwrite_c=True,
+                )
+                work_done += naux * nocc[s] * nocc[s] * nvir[s]
+            if not TDA:
+                apb_prod[ivec][s * dim[0] : s * dim[0] + dim[s]] += (waz + wbz).ravel()
+                amb_prod[ivec][s * dim[0] : s * dim[0] + dim[s]] += (waz - wbz).ravel()
+            else:
+                apb_prod[ivec][s * dim[0] : s * dim[0] + dim[s]] += waz.ravel()
+
+    # contraction: orbital energy difference
+    for s in range(nspin):
+        orb_diff = np.asarray(mo_energy[s][None, nocc[s] :] - mo_energy[s][: nocc[s], None]).reshape(-1)
+        for ivec in range(ntri):
+            oz = orb_diff * tri_vec[ivec][s * dim[0] : s * dim[0] + dim[s]]
+            apb_prod[ivec][s * dim[0] : s * dim[0] + dim[s]] += oz
+            if not TDA:
+                amb_prod[ivec][s * dim[0] : s * dim[0] + dim[s]] += oz
+            work_done += 2 * oz.size
+
+    return apb_prod, amb_prod, work_done
+
+
+def _get_lpq_bar(nocc, mo_energy, Lpq):
+    """Calculate the auxiliary 3-center matrix.
+    Lpq_bar = (epsilon)^-1 * Lpq
+    Equation 11 in doi.org/10.1002/jcc.24688.
+
+    Parameters
+    ----------
+    nocc : int array
+        the number of occupied orbitals
+    mo_energy : double ndarray
+        orbital energy
+    Lpq : double ndarray
+        3-center density-fitting matrix
+
+    Returns
+    -------
+    Lpq_bar : double ndarray
+        auxiliary three-center matrix
+    """
+    nspin, naux, _, _ = Lpq.shape
+
+    # calculate the response function in the auxiliary basis
+    X = np.zeros(shape=[naux, naux], dtype=np.double)
+    for i in range(nspin):
+        orb_diff = mo_energy[i][: nocc[i], None] - mo_energy[i][None, nocc[i] :]
+        orb_diff = 1.0 / orb_diff
+        X += 2.0 * einsum('Pia,ia,Qia->PQ', Lpq[i][:, : nocc[i], nocc[i] :], orb_diff, Lpq[i][:, : nocc[i], nocc[i] :])
+    if nspin == 1:
+        X *= 2.0
+
+    # calculate the inverse dielectric function
+    InvD = np.linalg.inv((np.eye(naux) - X))
+
+    # calculate the auxiliary matrix
+    Lpq_bar = einsum('PQ,sQmn->sPmn', InvD, Lpq)
+
+    return Lpq_bar
+
+
+def _get_lpq_bar_by_block(nocc, mo_energy, Lii, Lia):
+    """Calculate the auxiliary 3-center matrix.
+    Lpq_bar = (epsilon)^-1 * Lpq
+    Equation 11 in doi.org/10.1002/jcc.24688.
+
+    Parameters
+    ----------
+    nocc : int array
+        numbers of occupied orbitals
+    mo_energy : double ndarray
+            orbital energy
+    Lii : double ndarray
+        3-center density-fitting matrix
+    Lia : double ndarray
+        3-center density-fitting matrix
+
+    Returns
+    -------
+    Lii_bar : double ndarray
+        auxiliary three-center matrix
+    Lia_bar : double ndarray
+        auxiliary three-center matrix
+    """
+    nspin = len(Lia)
+    naux, _, _ = Lia[0].shape
+    nvir = [Lia_s.shape[2] for Lia_s in Lia]
+
+    # calculate the response function in the auxiliary basis
+    X = np.zeros(shape=[naux, naux], dtype=np.double)
+    for i in range(nspin):
+        orb_diff = mo_energy[i][: nocc[i], None] - mo_energy[i][None, nocc[i] :]
+        orb_diff = 1.0 / orb_diff
+        Pia = Lia[i] * (orb_diff * 2.0)
+
+        # This line computes Pi = einsum('Pia, Qia -> PQ', Pia, Lia)
+        X += Pia.reshape(naux, -1) @ Lia[i].reshape(naux, -1).T
+        # X += 2.0 * einsum('Pia,ia,Qia->PQ', Lia[i], orb_diff, Lia[i])
+    if nspin == 1:
+        X *= 2.0
+
+    # calculate the inverse dielectric function
+    InvD = np.linalg.inv((np.eye(naux) - X))
+
+    Lia_bar = []
+    Lii_bar = []
+
+    # calculate the auxiliary matrix
+    # Lpq_bar = einsum('PQ,sQmn->sPmn', InvD, Lpq)
+    for i in range(nspin):
+        Lia_bar.append(np.matmul(InvD, Lia[i].reshape(naux, -1)).reshape(naux, nocc[i], nvir[i]))
+
+        Lii_bar.append(np.matmul(InvD, Lii[i].reshape(naux, -1)).reshape(naux, nocc[i], nocc[i]))
+
+    # _bse_contraction reshapes these tensors assuming occupied-major layout.
+    Lii_bar = [np.ascontiguousarray(Lii_bar[s].transpose(1, 0, 2)) for s in range(nspin)]
+    Lia_bar = [np.ascontiguousarray(Lia_bar[s].transpose(1, 0, 2)) for s in range(nspin)]
+
+    return Lii_bar, Lia_bar
+
+
+def _get_oscillator_strength(multi, exci, X_vec, Y_vec, mo_coeff, nocc, mol):
+    """Get transition dipoles and oscillator strengths.
+
+    Parameters
+    ----------
+    multi : str
+        multiplicity. "s"=singlet, "t"=triplet, "u"=unrestricted.
+    exci : double array
+        excitation energy.
+    X_vec : double ndarray
+        X block of eigenvector (excitation).
+    Y_vec : double ndarray
+        Y block of eigenvector (de-excitation).
+    mo_coeff : double ndarray
+        coefficient from AO to MO.
+    nocc : int array
+        number of occupied orbitals.
+    mol : pyscf.gto.mole.Mole
+        Mole object for generating dipole matrix.
+
+    Returns
+    -------
+    dipole : double ndarray
+        transition dipoles of all excitations.
+    oscillator_strength : double array
+        oscillator strengths of all excitations.
+    """
+    nspin, _, _ = mo_coeff.shape
+    nroot = X_vec[0].shape[0]
+
+    dipole = np.zeros(shape=[3, nroot], dtype=np.double, order='F')
+    oscillator_strength = np.zeros(shape=[nroot], dtype=np.double)
+
+    # BSE is blind to triplet oscillator strength
+    if multi == 't':
+        return dipole, oscillator_strength
+
+    with mol.with_common_orig((0, 0, 0)):
+        ao_dip = mol.intor_symmetric('int1e_r', comp=3)
+
+    # Transform AO dipole integrals to MO basis
+    mo_dip = [mo_coeff[s][:, : nocc[s]].T @ ao_dip @ mo_coeff[s][:, nocc[s] :] for s in range(nspin)]
+
+    for j in range(nroot):
+        for s in range(nspin):
+            dipole[:, j] += np.einsum('ia,xia->x', X_vec[s][j], mo_dip[s]) + np.einsum(
+                'ia,xia->x', Y_vec[s][j], mo_dip[s]
+            )
+
+    if nspin == 1:
+        dipole *= np.sqrt(2)
+
+    oscillator_strength = (2 / 3) * exci * np.sum(dipole**2, axis=0)
+
+    return dipole, oscillator_strength
+
+
+def _get_spin_square(nocc, X_vec, Y_vec, mo_coeff, ovlp):
+    """Get <S2> expectation value.
+
+    Parameters
+    ----------
+    nocc : int array
+        number of occupied orbitals.
+    X_vec : double ndarray
+        X block of eigenvector (excitation).
+    Y_vec : double ndarray
+        Y block of eigenvector (de-excitation).
+    mo_coeff : double ndarray
+        coefficient from AO to MO.
+    ovlp : double ndarray
+        overlap matrix.
+
+    Returns
+    -------
+    s2 : double array
+        <S2> expectation value of excitations.
+    """
+    nroot = X_vec[0].shape[0]
+    ab_ovlp = mo_coeff[0].T @ ovlp @ mo_coeff[1]
+    s2 = np.zeros(shape=[nroot], dtype=np.double)
+    s2[:] = nocc[0] - (nocc[0] - nocc[1]) / 2.0 + ((nocc[0] - nocc[1]) / 2.0) ** 2
+    for iroot in range(nroot):
+        # alpha excitation ket
+        # a alpha and j beta exchange: alpha excitation bra
+        s2[iroot] -= einsum(
+            'ia,ib,aj,bj->',
+            X_vec[0][iroot] + Y_vec[0][iroot],
+            X_vec[0][iroot] - Y_vec[0][iroot],
+            ab_ovlp[nocc[0] :, : nocc[1]],
+            ab_ovlp[nocc[0] :, : nocc[1]],
+        )
+        # a alpha and j beta exchange: beta excitation bra
+        s2[iroot] -= einsum(
+            'ia,jb,ij,ab->',
+            X_vec[0][iroot] + Y_vec[0][iroot],
+            X_vec[1][iroot] - Y_vec[1][iroot],
+            ab_ovlp[: nocc[0], : nocc[1]],
+            ab_ovlp[nocc[0] :, nocc[1] :],
+        )
+        # i alpha and j beta exchange: same alpha excitation bra
+        s2[iroot] -= einsum(
+            'ia,ia,jk->',
+            X_vec[0][iroot] + Y_vec[0][iroot],
+            X_vec[0][iroot] - Y_vec[0][iroot],
+            ab_ovlp[: nocc[0], : nocc[1]] ** 2,
+        )
+        s2[iroot] += einsum(
+            'ia,ia,ik->',
+            X_vec[0][iroot] + Y_vec[0][iroot],
+            X_vec[0][iroot] - Y_vec[0][iroot],
+            ab_ovlp[: nocc[0], : nocc[1]] ** 2,
+        )
+        # beta excitation ket
+        # i alpha and b beta exchange: beta excitation bra
+        s2[iroot] -= einsum(
+            'ia,ib,ja,jb->',
+            X_vec[1][iroot] + Y_vec[1][iroot],
+            X_vec[1][iroot] - Y_vec[1][iroot],
+            ab_ovlp[: nocc[0], nocc[1] :],
+            ab_ovlp[: nocc[0], nocc[1] :],
+        )
+        # i alpha and b beta exchange: alpha excitation bra
+        s2[iroot] -= einsum(
+            'ia,jb,ji,ba->',
+            X_vec[1][iroot] + Y_vec[1][iroot],
+            X_vec[0][iroot] - Y_vec[0][iroot],
+            ab_ovlp[: nocc[0], : nocc[1]],
+            ab_ovlp[nocc[0] :, nocc[1] :],
+        )
+        # i alpha and j beta exchange: same alpha excitation bra
+        s2[iroot] -= einsum(
+            'ia,ia,jk->',
+            X_vec[1][iroot] + Y_vec[1][iroot],
+            X_vec[1][iroot] - Y_vec[1][iroot],
+            ab_ovlp[: nocc[0], : nocc[1]] ** 2,
+        )
+        s2[iroot] += einsum(
+            'ia,ia,ji->',
+            X_vec[1][iroot] + Y_vec[1][iroot],
+            X_vec[1][iroot] - Y_vec[1][iroot],
+            ab_ovlp[: nocc[0], : nocc[1]] ** 2,
+        )
+
+    return s2
+
+
+class BSE(lib.StreamObject):
+    def __init__(self, gw):
+        """Initialize BSE object.
+        The BSE object can be initialized by a restricted or unrestricted mol/Gamma GW object.
+
+        Parameters
+        ----------
+        gw : GWAC/UGWAC, optional
+            GW object, by default None
+        """
+        self.verbose = gw.verbose  # verbose level
+        self.nspin = 1 if np.asarray(gw.mo_energy).ndim == 1 else 2  # 1 for restricted, 2 for unrestricted
+        self.mol = gw.mol  # mol object
+        self.mf = gw._scf  # mean-field object
+        self.nocc = np.asarray(gw.nocc)  # number of occupied orbitals
+        if self.nocc.ndim == 0:
+            self.nocc = self.nocc[np.newaxis, ...]
+        self.mo_energy = np.asarray(gw.mo_energy)  # orbital energy
+        if self.mo_energy.ndim == 1:
+            self.mo_energy = self.mo_energy[np.newaxis, ...]
+        self.mo_coeff = gw.mo_coeff  # orbital coefficient from AO to MO
+        if self.mo_coeff.ndim == 2:
+            self.mo_coeff = self.mo_coeff[np.newaxis, ...]
+        self.nmo = self.mo_energy.shape[-1]  # number of molecular orbitals
+        # initialize density-fitting matrix
+        if self.nspin == 2 and isinstance(gw.nmo, int):
+            gw.nmo = [gw.nmo, gw.nmo]
+        self.Lpq = gw.Lpq if hasattr(gw, 'Lpq') else None  # three-center density-fitting matrix in MO
+        if self.Lpq is None:
+            self.Lpq = np.asarray(np.asarray(gw.ao2mo(gw.mo_coeff)))
+        if self.Lpq.ndim == 3:
+            self.Lpq = self.Lpq[np.newaxis, ...]
+
+        # options
+        self.TDA = False  # use TDA approximation to ignore B matrix
+        self.delete_lpq = False  # delete Lpq after calculation
+        self.chkfile = None  # checkpoint file
+        self.chk_every = 10  # checkpoint frequency
+
+        # Davidson algorithm
+        self.multi = None  # multiplicity
+        self.nroot = 10  # the number of desired roots
+        self.trial = 'identity'  # mode to initialize trial vector
+        self.nocc_sub = 50  # number of occpuied orbitals in the trial vector subspace
+        self.nvir_sub = 150  # number of virtual orbitals in the trial vector subspace
+        self.max_vec = 12 * self.nroot  # max allowed subspace size
+        self.max_iter = 100  # max Davidson iteration
+        # max number of trial vectors to expand per iteration
+        self.max_expand = min(100, self.nroot)
+        self.residue_thresh = 1e-8  # threshold if the residue needs to be added as a new trial vector
+        self.init_ntri = min(100, self.nroot)
+        self.restart_max_size = None  # max number of trial vectors to keep during a restart
+
+        # results
+        self.exci = None  # excitation energy
+        self.X_vec = None  # X block of eigenvector (excitation)
+        self.Y_vec = None  # Y block of eigenvector (de-excitation)
+        return
+
+    def dump_flags(self):
+        """Dump BSE flags."""
+        log = lib.logger.Logger(self.stdout, self.verbose)
+        log.info('')
+        log.info('******** %s ********', self.__class__)
+        nvir = [(self.nmo - self.nocc[i]) for i in range(self.nspin)]
+        dim = [(self.nocc[i] * nvir[i]) for i in range(self.nspin)]
+        log.info('multiplicity = %s', self.multi)
+        log.info('nmo = %s', self.nmo)
+        log.info('nocc = %s', self.nocc[0] if self.nspin == 1 else self.nocc)
+        log.info('nvir = %s', nvir[0] if self.nspin == 1 else nvir)
+        log.info('occ-vir dimension = %s', dim[0] if self.nspin == 1 else dim)
+        if self.nspin == 2:
+            log.info('BSE full dimension = %s', dim[0] + dim[1])
+        log.info('Tamm-Dancoff approximation = %s', self.TDA)
+        log.info('number of roots = %d', self.nroot)
+        log.info('trial vector = %s', self.trial)
+        if self.trial == 'subspace':
+            log.info('subspace nocc = %d nvir = %d', self.nocc_sub, self.nvir_sub)
+        log.info('max subspace size = %d', self.max_vec)
+        log.info('max iteration = %s', self.max_iter)
+        log.info('convergence tolerance = %s', self.residue_thresh)
+        log.info('')
+        return
+
+    def check_memory(self):
+        """Check memory needed for the BSE calculation."""
+        nvir = [(self.nmo - self.nocc[i]) for i in range(self.nspin)]
+        dim = [(self.nocc[i] * nvir[i]) for i in range(self.nspin)]
+        full_dim = dim[0] + dim[1] if self.nspin == 2 else dim[0]
+        naux = self.Lpq.shape[1]
+
+        # Lpq and Lpq_bar; trial vector, A+B/A-B matrix with trial vector product
+        mem = (naux * self.nmo * self.nmo * 2 + self.max_vec * full_dim * 3) * 8
+        lib.logger.info(self, 'BSE needs at least %.1f GB memory.', mem / 1.0e9)
+
+        return
+
+    def kernel(self, multi, e_min=0.0, delta=0.0, **kwargs):
+        """Davidson algorithm for BSE.
+
+        Parameters
+        ----------
+        multi : str
+            multiplicity. "s"=singlet, "t"=triplet, "u"=unrestricted.
+        e_min : float, optional
+            minimum excitation energy, by default 0.0
+        delta : float, optional
+            energy shift for trial vector generation, typically <=0.0, by default 0.0
+
+        Returns
+        -------
+        exci : double array
+            excitation energy.
+        X_vec : list
+            X block of eigenvector (excitation).
+        Y_vec : list
+            Y block of eigenvector (de-excitation).
+        """
+        # check spin and multiplicity
+        assert isinstance(multi, str)
+        multi = multi[0].lower()
+        assert (self.nspin == 1 and (multi == 's' or multi == 't')) or (self.nspin == 2 and multi == 'u')
+        self.multi = multi
+
+        cput0 = (time.process_time(), time.perf_counter())
+        self.dump_flags()
+        self.check_memory()
+        self.exci, self.X_vec, self.Y_vec = bse_davidson(bse=self, multi=multi, e_min=e_min, delta=delta, **kwargs)
+        lib.logger.timer(self, 'BSE', *cput0)
+        return self.exci, self.X_vec, self.Y_vec
+
+    def full_diagonalization(self, multi):
+        """Full diagonalization.
+
+        Parameters
+        ----------
+        multi : str
+            multiplicity. "s"=singlet, "t"=triplet, "u"=unrestricted.
+
+        Returns
+        -------
+        exci : double array
+            excitation energy.
+        X_vec : list
+            X block of eigenvector (excitation).
+        Y_vec : list
+            Y block of eigenvector (de-excitation).
+        """
+        cput0 = (time.process_time(), time.perf_counter())
+        lib.logger.info(self, '\nBSE full diagonalization: %s', multi)
+        self.multi = multi
+
+        # set nroot as full dimension for analysis
+        nvir = [(self.nmo - self.nocc[i]) for i in range(self.nspin)]
+        dim = [(self.nocc[i] * nvir[i]) for i in range(self.nspin)]
+        self.nroot = dim[0] + dim[1] if self.nspin == 2 else dim[0]
+
+        # A+B, A-B, X+Y, X-Y
+        mem = (self.nroot * self.nroot * 4) * 8
+        lib.logger.info(self, 'BSE needs at least %.1f GB memory.', mem / 1.0e9)
+
+        self.exci, self.X_vec, self.Y_vec = bse_full_diagonalization(
+            multi=multi, nocc=self.nocc, mo_energy=self.mo_energy, Lpq=self.Lpq, TDA=self.TDA
+        )
+        lib.logger.timer(self, 'BSE full diagonalization', *cput0)
+        return self.exci, self.X_vec, self.Y_vec
+
+    def analyze(self, thresh=0.1, oscillator=True, s2=True, e_min=0.0):
+        """Analyze excitations.
+
+        Parameters
+        ----------
+        thresh : float, optional
+            threshold to print dominant component, by default 0.1
+        oscillator : bool, optional
+            calculate oscillator strength, by default True
+        s2 : bool, optional
+            calculate <S2> expectation value, by default True
+        e_min : float, optional
+            minimum excitation energy to analyze, by default 0.0
+        """
+        multi = self.multi
+        nspin = self.nspin
+        nmo = self.nmo
+        nocc = self.nocc
+
+        emin_index = np.searchsorted(self.exci, e_min, side='left')
+        exci = self.exci[emin_index:]
+
+        X_vec = [X_vec_s[emin_index:] for X_vec_s in self.X_vec]
+        Y_vec = [Y_vec_s[emin_index:] for Y_vec_s in self.Y_vec]
+        nvir = [(nmo - nocc[i]) for i in range(nspin)]
+
+        if oscillator is True:
+            dipole, oscillator_strength = _get_oscillator_strength(
+                multi=multi, exci=exci, X_vec=X_vec, Y_vec=Y_vec, mo_coeff=self.mo_coeff, nocc=nocc, mol=self.mol
+            )
+
+        if s2 is True and nspin == 2:
+            s2 = _get_spin_square(nocc=nocc, X_vec=X_vec, Y_vec=Y_vec, mo_coeff=self.mo_coeff, ovlp=self.mf.get_ovlp())
+
+        lib.logger.info(self, '-' * 55)
+        if multi == 's':
+            lib.logger.info(self, 'restricted singlet BSE')
+        elif multi == 't':
+            lib.logger.info(self, 'restricted triplet BSE')
+        elif multi == 'u':
+            lib.logger.info(self, 'unrestricted BSE')
+        for r in range(exci.size):
+            lib.logger.info(self, '-' * 55)
+            lib.logger.info(self, 'excited state: %-d' % (r + 1))
+            lib.logger.info(self, 'excitation energy:   %15.8f   AU   %15.8f   eV' % (exci[r], exci[r] * HARTREE2EV))
+            if multi == 's':
+                if oscillator is True:
+                    lib.logger.info(self, 'spin allowed, oscillator strength:   %15.8f   AU' % oscillator_strength[r])
+                    lib.logger.info(
+                        self,
+                        'transition dipole: x =  %15.6f  , y =  %15.6f  , z =  %15.6f'
+                        % (dipole[0][r], dipole[1][r], dipole[2][r]),
+                    )
+            elif multi == 't':
+                if oscillator is True:
+                    lib.logger.info(self, 'spin forbidden, oscillator strength and transition dipoles are not defined')
+            elif multi == 'u':
+                if s2 is True:
+                    lib.logger.info(self, '<S^2> =    %.6f' % s2[r])
+                if oscillator is True:
+                    lib.logger.info(self, 'oscillator strength:   %15.8f   AU' % oscillator_strength[r])
+                    lib.logger.info(
+                        self,
+                        'transition dipole: x =  %15.6f  , y =  %15.6f  , z =  %15.6f'
+                        % (dipole[0][r], dipole[1][r], dipole[2][r]),
+                    )
+
+            lib.logger.info(self, 'dominant component')
+            if nspin == 1:
+                for i in range(nocc[0]):
+                    for a in range(nvir[0]):
+                        if abs(X_vec[0][r][i][a]) > thresh:
+                            lib.logger.info(
+                                self, '%5d -> %5d, %15.8f, %s' % (i + 1, a + nocc[0] + 1, float(X_vec[0][r][i][a]), 'X')
+                            )
+                        if abs(Y_vec[0][r][i][a]) > thresh:
+                            lib.logger.info(
+                                self, '%5d -> %5d, %15.8f, %s' % (i + 1, a + nocc[0] + 1, float(Y_vec[0][r][i][a]), 'Y')
+                            )
+            else:
+                for s in range(nspin):
+                    for i in range(nocc[s]):
+                        for a in range(nvir[s]):
+                            if abs(X_vec[s][r][i][a]) > thresh:
+                                lib.logger.info(
+                                    self,
+                                    '%5d -> %5d, spin %d, %15.8f, %s'
+                                    % (i + 1, a + nocc[s] + 1, s, float(X_vec[s][r][i][a]), 'X'),
+                                )
+                            if abs(Y_vec[s][r][i][a]) > thresh:
+                                lib.logger.info(
+                                    self,
+                                    '%5d -> %5d, spin %d, %15.8f, %s'
+                                    % (i + 1, a + nocc[s] + 1, s, float(Y_vec[s][r][i][a]), 'Y'),
+                                )
+        return
+
+    def get_oscillator_strength(self):
+        """Get transition dipoles and oscillator strengths.
+
+        Returns
+        -------
+        dipole : double array
+            transition dipoles.
+        oscillator_strength : double array
+            oscillator strengths.
+        """
+        assert self.exci is not None and self.X_vec is not None and self.Y_vec is not None
+        assert self.mo_coeff is not None and self.mol is not None
+        dipole, oscillator_strength = _get_oscillator_strength(
+            multi=self.multi,
+            exci=self.exci,
+            X_vec=self.X_vec,
+            Y_vec=self.Y_vec,
+            mo_coeff=self.mo_coeff,
+            nocc=self.nocc,
+            mol=self.mol,
+        )
+
+        return dipole, oscillator_strength
diff --git a/pyscf/gw/test/test_bse.py b/pyscf/gw/test/test_bse.py
new file mode 100644
index 0000000000..1b63806f14
--- /dev/null
+++ b/pyscf/gw/test/test_bse.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+import pytest
+from pyscf import dft, gto
+from pyscf.gw.bse import BSE
+from pyscf.gw.gw_ac import GWAC
+from pyscf.gw.ugw_ac import UGWAC
+
+
+@pytest.fixture(scope='module')
+def h2o_pbe_gw():
+    mol = gto.Mole()
+    mol.verbose = 0
+    mol.atom = [[8, (0.0, 0.0, 0.0)], [1, (0.7571, 0.0, 0.5861)], [1, (-0.7571, 0.0, 0.5861)]]
+    mol.basis = 'def2-svp'
+    mol.build()
+
+    mf = dft.RKS(mol)
+    mf.xc = 'pbe'
+    mf.kernel()
+
+    gw = GWAC(mf)
+    gw.kernel()
+    return gw
+
+
+@pytest.fixture(scope='module')
+def h2o_cation_pbe_ugw():
+    mol = gto.Mole()
+    mol.verbose = 0
+    mol.atom = [[8, (0.0, 0.0, 0.0)], [1, (0.7571, 0.0, 0.5861)], [1, (-0.7571, 0.0, 0.5861)]]
+    mol.charge = 1
+    mol.spin = 1
+    mol.basis = 'def2-svp'
+    mol.build()
+
+    mf = dft.UKS(mol)
+    mf.xc = 'pbe'
+    mf.kernel()
+
+    gw = UGWAC(mf)
+    gw.kernel()
+    return gw
+
+
+def test_bse_singlet(h2o_pbe_gw):
+    bse = BSE(h2o_pbe_gw)
+    exci = bse.kernel('s')[0]
+    assert exci[0] == pytest.approx(0.25749397, abs=1e-5)
+
+
+def test_bse_triplet(h2o_pbe_gw):
+    bse = BSE(h2o_pbe_gw)
+    exci = bse.kernel('t')[0]
+    assert exci[0] == pytest.approx(0.22299263, abs=1e-5)
+
+
+def test_bse_unrestricted(h2o_cation_pbe_ugw):
+    bse = BSE(h2o_cation_pbe_ugw)
+    exci = bse.kernel('u')[0]
+    assert exci[0] == pytest.approx(0.02114003, abs=1e-5)
+
+
+def test_bse_energy_specific_singlet(h2o_pbe_gw):
+    bse = BSE(h2o_pbe_gw)
+    exci = bse.kernel('s', e_min=0.4)[0]
+    assert exci[0] == pytest.approx(0.42691789, abs=1e-5)
+
+
+def test_bse_energy_specific_triplet(h2o_pbe_gw):
+    bse = BSE(h2o_pbe_gw)
+    exci = bse.kernel('t', e_min=0.4)[0]
+    assert exci[0] == pytest.approx(0.45195324, abs=1e-5)
diff --git a/pyscf/gw/test/test_gw.py b/pyscf/gw/test/test_gw.py
index 5b3a0e92be..4b53ddf751 100644
--- a/pyscf/gw/test/test_gw.py
+++ b/pyscf/gw/test/test_gw.py
@@ -48,8 +48,8 @@ def test_gwac_pade_frozen(self):
         gw_obj.ac = 'pade'
         gw_obj.orbs = range(nocc-3, nocc+3)
         gw_obj.kernel()
-        self.assertAlmostEqual(gw_obj.mo_energy[nocc-1], -0.4129411145067107, 8)
-        self.assertAlmostEqual(gw_obj.mo_energy[nocc], 0.16568737755110896, 8)
+        self.assertAlmostEqual(gw_obj.mo_energy[nocc-1], -0.4129411145067107, 7)
+        self.assertAlmostEqual(gw_obj.mo_energy[nocc], 0.16568737755110896, 7)
 
         gw_obj = gw.GW(mf, freq_int='ac')
         gw_obj.frozen = np.array([0])
@@ -57,8 +57,8 @@ def test_gwac_pade_frozen(self):
         gw_obj.ac = 'pade'
         gw_obj.orbs = range(nocc-3, nocc+3)
         gw_obj.kernel()
-        self.assertAlmostEqual(gw_obj.mo_energy[nocc-1], -0.4129411145067107, 8)
-        self.assertAlmostEqual(gw_obj.mo_energy[nocc], 0.16568737755110896, 8)
+        self.assertAlmostEqual(gw_obj.mo_energy[nocc-1], -0.4129411145067107, 7)
+        self.assertAlmostEqual(gw_obj.mo_energy[nocc], 0.16568737755110896, 7)
 
     def test_gwcd(self):
         nocc = mol.nelectron//2
diff --git a/pyscf/gw/test/test_gw_ac.py b/pyscf/gw/test/test_gw_ac.py
index bbcec0bc16..1dd87419f4 100644
--- a/pyscf/gw/test/test_gw_ac.py
+++ b/pyscf/gw/test/test_gw_ac.py
@@ -2,13 +2,12 @@
 
 import pytest
 import numpy as np
-from pyscf import gto, scf, dft
+from pyscf import gto, dft
 from pyscf.gw.gw_ac import GWAC
 
 @pytest.fixture
 def h2o_pbe0():
     mol = gto.Mole()
-    mol.verbose = 5
     mol.atom = [[8, (0.0, 0.0, 0.0)], [1, (0.0, -0.7571, 0.5861)], [1, (0.0, 0.7571, 0.5861)]]
     mol.basis = 'def2-svp'
     mol.build()
diff --git a/pyscf/gw/test/test_ugw_ac.py b/pyscf/gw/test/test_ugw_ac.py
index 9a3a3b60d7..8600b92004 100644
--- a/pyscf/gw/test/test_ugw_ac.py
+++ b/pyscf/gw/test/test_ugw_ac.py
@@ -6,7 +6,6 @@
 @pytest.fixture
 def h2o_cation_uhf():
     mol = gto.Mole()
-    mol.verbose = 5
     mol.atom = [[8, (0.0, 0.0, 0.0)], [1, (0.0, -0.7571, 0.5861)], [1, (0.0, 0.7571, 0.5861)]]
     mol.basis = 'def2-svp'
     mol.charge = 1
diff --git a/pyscf/hessian/dispersion.py b/pyscf/hessian/dispersion.py
index ad37289312..efa023f5fb 100644
--- a/pyscf/hessian/dispersion.py
+++ b/pyscf/hessian/dispersion.py
@@ -30,10 +30,12 @@ def get_dispersion(hessobj, disp=None, with_3body=None):
     mol = mf.mol
     natm = mol.natm
     h_disp = np.zeros([natm,natm,3,3])
-    disp_version = check_disp(mf, disp)
-    if not disp_version:
+    if not check_disp(mf, disp):
         return h_disp
 
+    if disp is None:
+        disp = getattr(mf, 'disp', None)
+
     try:
         from pyscf.dispersion import dftd3, dftd4
     except ImportError:
@@ -41,9 +43,9 @@ def get_dispersion(hessobj, disp=None, with_3body=None):
         raise
 
     method = getattr(mf, 'xc', 'hf')
-    method, _, disp_with_3body = parse_disp(method)
+    method, disp_version, disp_with_3body = parse_disp(method, disp)
 
-    if with_3body is not None:
+    if with_3body is None:
         with_3body = disp_with_3body
 
     if disp_version[:2].upper() == 'D3':
diff --git a/pyscf/hessian/test/test_rhf.py b/pyscf/hessian/test/test_rhf.py
index fc99b668ca..9a06fdac2e 100644
--- a/pyscf/hessian/test/test_rhf.py
+++ b/pyscf/hessian/test/test_rhf.py
@@ -19,7 +19,7 @@
 from pyscf import grad, hessian
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 def setUpModule():
diff --git a/pyscf/hessian/test/test_rks.py b/pyscf/hessian/test/test_rks.py
index 43634b6e8e..6c5a8d36db 100644
--- a/pyscf/hessian/test/test_rks.py
+++ b/pyscf/hessian/test/test_rks.py
@@ -19,7 +19,7 @@
 from pyscf import grad, hessian
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 def setUpModule():
diff --git a/pyscf/hessian/test/test_uhf.py b/pyscf/hessian/test/test_uhf.py
index 8b65361b7b..ee5efceb80 100644
--- a/pyscf/hessian/test/test_uhf.py
+++ b/pyscf/hessian/test/test_uhf.py
@@ -19,7 +19,7 @@
 from pyscf import grad, hessian
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 def setUpModule():
diff --git a/pyscf/hessian/test/test_uks.py b/pyscf/hessian/test/test_uks.py
index b1c3af8b24..8ebbb7c0e8 100644
--- a/pyscf/hessian/test/test_uks.py
+++ b/pyscf/hessian/test/test_uks.py
@@ -19,7 +19,7 @@
 from pyscf import grad, hessian
 try:
     from pyscf.dispersion import dftd3, dftd4
-except ImportError:
+except (ImportError, OSError):
     dftd3 = dftd4 = None
 
 def setUpModule():
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index 60f404849f..51de15a1bf 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -128,7 +128,9 @@ include_directories("${PROJECT_BINARY_DIR}")
 
 # See also https://gitlab.kitware.com/cmake/community/wikis/doc/cmake/RPATH-handling
 if (WIN32)
-  #?
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--disable-runtime-pseudo-reloc")
+  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}")
 elseif (APPLE)
   set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
   set(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/deps/lib;@loader_path/deps/lib64")
diff --git a/pyscf/lib/agf2/uagf2.c b/pyscf/lib/agf2/uagf2.c
index f16b97d53a..e59df223ad 100644
--- a/pyscf/lib/agf2/uagf2.c
+++ b/pyscf/lib/agf2/uagf2.c
@@ -317,17 +317,19 @@ void AGF2udf_vv_vev_islice_lowmem(double *qxi,
         do_os = j < nob;
         do_ss = j < noa;
 
-        // build qx_i
+        // build qx_i / qa_i (always indexed by i < noa)
         AGF2slice_01i(qxi, naux, nmo, noa, i, qx_i);
-
-        // build qx_j
-        AGF2slice_01i(qxi, naux, nmo, noa, j, qx_j);
-
-        // build qa_i
         AGF2slice_0i2(qja, naux, noa, nva, i, qa_i);
 
-        // build qa_j
-        AGF2slice_0i2(qja, naux, noa, nva, j, qa_j);
+        // Build qx_j / qa_j only when j is in the alpha range. With
+        // nob > noa, j ranges up to nob-1 for the cross-spin (do_os) part,
+        // and slicing the alpha arrays qxi/qja at j >= noa would read past
+        // their noa-dim. The OS path uses qx_j_b / qa_j_b instead, so the
+        // alpha j slice is only needed for do_ss.
+        if (do_ss) {
+            AGF2slice_01i(qxi, naux, nmo, noa, j, qx_j);
+            AGF2slice_0i2(qja, naux, noa, nva, j, qa_j);
+        }
 
         if (do_ss) {
             // build xija
diff --git a/pyscf/lib/cc/ccsd_t.c b/pyscf/lib/cc/ccsd_t.c
index 22e3e5e4ea..dbda960255 100644
--- a/pyscf/lib/cc/ccsd_t.c
+++ b/pyscf/lib/cc/ccsd_t.c
@@ -392,15 +392,16 @@ void CCsd_t_contract(double *e_tot,
                                         cache_row_b, cache_col_b, sizeof(double));
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, mo_energy, t1T, t2T, nirrep, o_ir_loc, \
-               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, stderr)
+               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         double *cache1 = malloc(sizeof(double) * (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in CCsd_t_contract\n",
+                fprintf(err_fp, "malloc(%zu) failed in CCsd_t_contract\n",
                         sizeof(double) * nocc*nocc*nocc*3);
                 exit(1);
         }
@@ -447,15 +448,16 @@ void QCIsd_t_contract(double *e_tot,
                                         cache_row_b, cache_col_b, sizeof(double));
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, mo_energy, t1T, t2T, nirrep, o_ir_loc, \
-               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, stderr)
+               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         double *cache1 = malloc(sizeof(double) * (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in QCIsd_t_contract\n",
+                fprintf(err_fp, "malloc(%zu) failed in QCIsd_t_contract\n",
                         sizeof(double) * nocc*nocc*nocc*3);
                 exit(1);
         }
@@ -628,15 +630,16 @@ void CCsd_t_zcontract(double complex *e_tot,
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, mo_energy, t1T, t2T, nirrep, o_ir_loc, \
-               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, stderr)
+               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         double complex *cache1 = malloc(sizeof(double complex) * (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in CCsd_t_zcontract\n",
+                fprintf(err_fp, "malloc(%zu) failed in CCsd_t_zcontract\n",
                         sizeof(double complex) * nocc*nocc*nocc*3);
                 exit(1);
         }
@@ -686,15 +689,16 @@ void QCIsd_t_zcontract(double complex *e_tot,
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, mo_energy, t1T, t2T, nirrep, o_ir_loc, \
-               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, stderr)
+               v_ir_loc, oo_ir_loc, orbsym, vooo, fvo, jobs, e_tot, permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         double complex *cache1 = malloc(sizeof(double complex) * (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in QCIsd_t_zcontract\n",
+                fprintf(err_fp, "malloc(%zu) failed in QCIsd_t_zcontract\n",
                         sizeof(double complex) * nocc*nocc*nocc*3);
                 exit(1);
         }
@@ -872,15 +876,16 @@ void MPICCsd_t_contract(double *e_tot, double *mo_energy, double *t1T,
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, mo_energy, t1T, fvo, jobs, e_tot, slices, \
-               data_ptrs, permute_idx, stderr)
+               data_ptrs, permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         double *cache1 = malloc(sizeof(double) * (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in MPICCsd_t_contract\n",
+                fprintf(err_fp, "malloc(%zu) failed in MPICCsd_t_contract\n",
                         sizeof(double) * nocc*nocc*nocc*3);
                 exit(1);
         }
@@ -1105,15 +1110,16 @@ void CCsd_zcontract_t3T(double complex *t3Tw, double complex *t3Tv, double *mo_e
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, nkpts, t3Tw, t3Tv, mo_offset, mo_energy, t1T, fvo, jobs, slices, \
-               data_ptrs, permute_idx, stderr)
+               data_ptrs, permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         complex double *cache1 = malloc(sizeof(double complex) * (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in CCsd_zcontract_t3T\n",
+                fprintf(err_fp, "malloc(%zu) failed in CCsd_zcontract_t3T\n",
                         sizeof(double complex) * nocc*nocc*nocc*3);
                 exit(1);
         }
diff --git a/pyscf/lib/cc/uccsd_t.c b/pyscf/lib/cc/uccsd_t.c
index fedb09049e..19571dbb8c 100644
--- a/pyscf/lib/cc/uccsd_t.c
+++ b/pyscf/lib/cc/uccsd_t.c
@@ -299,16 +299,17 @@ void CCuccsd_t_aaa(double complex *e_tot,
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, mo_energy, t1T, t2T, nirrep, o_ir_loc, \
                v_ir_loc, oo_ir_loc, orbsym, vooo, fvohalf, jobs, e_tot, \
-               permute_idx, stderr)
+               permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         double *cache1 = malloc(sizeof(double) * (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in CCuccsd_t_aaa\n",
+                fprintf(err_fp, "malloc(%zu) failed in CCuccsd_t_aaa\n",
                         sizeof(double) * nocc*nocc*nocc*3);
                 exit(1);
         }
@@ -549,15 +550,16 @@ void CCuccsd_t_baa(double complex *e_tot,
         double *vs_ts[] = {mo_ea, mo_eb, fvo, fVO, vooo, vOoO, VoOo,
                 t1aT, t1bT, t2aaT, t2abT};
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
-        shared(njobs, nocca, noccb, nvira, nvirb, vs_ts, jobs, e_tot, stderr)
+        shared(njobs, nocca, noccb, nvira, nvirb, vs_ts, jobs, e_tot, err_fp)
 {
         int a, b, c;
         size_t k;
         double *cache1 = malloc(sizeof(double) * (noccb*nocca*nocca*5+1 +
                                                   nocca*2+noccb*2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in CCuccsd_t_baa\n",
+                fprintf(err_fp, "malloc(%zu) failed in CCuccsd_t_baa\n",
                         sizeof(double) * noccb*nocca*nocca*5);
                 exit(1);
         }
@@ -705,17 +707,18 @@ void CCuccsd_t_zaaa(double complex *e_tot,
         int *permute_idx = malloc(sizeof(int) * nocc*nocc*nocc * 6);
         _make_permute_indices(permute_idx, nocc);
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
         shared(njobs, nocc, nvir, mo_energy, t1T, t2T, nirrep, o_ir_loc, \
                v_ir_loc, oo_ir_loc, orbsym, vooo, fvohalf, jobs, e_tot, \
-               permute_idx, stderr)
+               permute_idx, err_fp)
 {
         int a, b, c;
         size_t k;
         double complex *cache1 = malloc(sizeof(double complex) *
                                         (nocc*nocc*nocc*3+2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in CCuccsd_t_zaaa\n",
+                fprintf(err_fp, "malloc(%zu) failed in CCuccsd_t_zaaa\n",
                         sizeof(double complex) * nocc*nocc*nocc*3);
                 exit(1);
         }
@@ -915,8 +918,9 @@ void CCuccsd_t_zbaa(double complex *e_tot,
                 (double complex *)mo_eb, fvo, fVO, vooo, vOoO, VoOo,
                 t1aT, t1bT, t2aaT, t2abT};
 
+        FILE *err_fp = stderr;
 #pragma omp parallel default(none) \
-        shared(njobs, nocca, noccb, nvira, nvirb, vs_ts, jobs, e_tot, stderr)
+        shared(njobs, nocca, noccb, nvira, nvirb, vs_ts, jobs, e_tot, err_fp)
 {
         int a, b, c;
         size_t k;
@@ -924,7 +928,7 @@ void CCuccsd_t_zbaa(double complex *e_tot,
                                         (noccb*nocca*nocca*5+1 +
                                          nocca*2+noccb*2));
         if (cache1 == NULL) {
-                fprintf(stderr, "malloc(%zu) failed in CCuccsd_t_zbaa\n",
+                fprintf(err_fp, "malloc(%zu) failed in CCuccsd_t_zbaa\n",
                         sizeof(double complex) * noccb*nocca*nocca*5);
                 exit(1);
         }
diff --git a/pyscf/lib/ccsdt/rccsdt.c b/pyscf/lib/ccsdt/rccsdt.c
index 30adf69aac..e2c9200c98 100644
--- a/pyscf/lib/ccsdt/rccsdt.c
+++ b/pyscf/lib/ccsdt/rccsdt.c
@@ -1,4 +1,4 @@
-/* Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+/* Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -496,6 +496,17 @@ const int64_t tp_t3[6][3] = {
     {2, 1, 0}, // reverse
 };
 
+static inline int64_t src_idx_from_full3(const int64_t *restrict perm, int64_t v0, int64_t v1, int64_t v2, int64_t nvir)
+{
+    int64_t src_abc[3];
+
+    src_abc[perm[0]] = v0;
+    src_abc[perm[1]] = v1;
+    src_abc[perm[2]] = v2;
+
+    return ((src_abc[0] * nvir + src_abc[1]) * nvir + src_abc[2]);
+}
+
 // Unpack triangular-stored T3 amplitudes into a full T3 block.
 //
 // This kernel reconstructs the full permutation-expanded T3 tensor block from the compressed triangular
@@ -521,6 +532,7 @@ void unpack_t3_tri2block_(const double *restrict t3_tri,
 {
 #define MAP(sym, x, y, z) map[(((sym) * nocc + (x)) * nocc + (y)) * nocc + (z)]
 #define MASK(sym, x, y, z) mask[(((sym) * nocc + (x)) * nocc + (y)) * nocc + (z)]
+#define VIDX(a, b, c) (((a) * nvir + (b)) * nvir + (c))
 
 #pragma omp parallel for collapse(4) schedule(dynamic)
     for (int64_t sym = 0; sym < 6; ++sym)
@@ -549,15 +561,10 @@ void unpack_t3_tri2block_(const double *restrict t3_tri,
                         {
                             for (int64_t c = 0; c < nvir; ++c)
                             {
-                                int64_t abc[3] = {a, b, c};
-                                int64_t aa = abc[perm[0]];
-                                int64_t bb = abc[perm[1]];
-                                int64_t cc = abc[perm[2]];
-
-                                int64_t src_idx = src_base + (a * nvir + b) * nvir + c;
-                                int64_t dest_idx = dest_base + (aa * nvir + bb) * nvir + cc;
+                                int64_t src = src_base + src_idx_from_full3(perm, a, b, c, nvir);
+                                int64_t dest = dest_base + VIDX(a, b, c);
 
-                                t3_blk[dest_idx] = t3_tri[src_idx];
+                                t3_blk[dest] = t3_tri[src];
                             }
                         }
                     }
@@ -569,112 +576,11 @@ void unpack_t3_tri2block_(const double *restrict t3_tri,
 #undef MASK
 }
 
-// Unpack a triangular-stored T3 (i, j, k) element into its 6-fold
-// permutation representation for a single occupied triplet.
-//
-// This routine identifies the symmetry representative of (i0, j0, k0) in the triangular (i <= j <= k) index domain,
-// applies the corresponding (a, b, c) permutation, and scatters the resulting amplitudes into `t3_blk`.
-// In addition, a second symmetry partner (selected via `tmp_indices`) is accumulated to complete the required
-// two-term contribution.  Conceptually, this corresponds to reconstructing:
-//
-//     t3_full[i0, j0, k0, :, :, :] + t3_full[j0, i0, k0, :, :, :].transpose(1, 0, 2)
-//
-// Input
-//   t3_tri     : triangular-stored T3 amplitudes
-//   t3_blk     : output buffer [nvir**3]
-//   map        : mapping (sym, i, j, k) -> tri index
-//   mask       : triangular-domain mask for valid (i, j, k)
-//   i0, j0, k0 : occupied indices for this element
-//   nocc       : number of occupied orbitals
-//   nvir       : number of virtual orbitals
-void unpack_t3_tri2single_pair_(const double *restrict t3_tri,
-                                double *restrict t3_blk,
-                                const int64_t *restrict map,
-                                const bool *restrict mask,
-                                int64_t i0, int64_t j0, int64_t k0,
-                                int64_t nocc, int64_t nvir)
-{
-
-#define MAP(sym, x, y, z) map[(((sym) * nocc + (x)) * nocc + (y)) * nocc + (z)]
-#define MASK(sym, x, y, z) mask[(((sym) * nocc + (x)) * nocc + (y)) * nocc + (z)]
-
-    int64_t sym;
-    for (sym = 0; sym < 6; ++sym)
-    {
-        if (MASK(sym, i0, j0, k0))
-            break;
-    }
-
-    const int64_t *perm = tp_t3[sym];
-    int64_t idx = MAP(sym, i0, j0, k0);
-
-#pragma omp parallel for collapse(3) schedule(static)
-    for (int64_t a = 0; a < nvir; ++a)
-    {
-        for (int64_t b = 0; b < nvir; ++b)
-        {
-            for (int64_t c = 0; c < nvir; ++c)
-            {
-                int64_t abc[3] = {a, b, c};
-                int64_t aa = abc[perm[0]];
-                int64_t bb = abc[perm[1]];
-                int64_t cc = abc[perm[2]];
-
-                int64_t src_idx = ((idx * nvir + a) * nvir + b) * nvir + c;
-                int64_t dest_idx = (aa * nvir + bb) * nvir + cc;
-
-                t3_blk[dest_idx] = t3_tri[src_idx];
-            }
-        }
-    }
-
-    const int64_t tmp_indices[6] = {2, 4, 0, 5, 1, 3};
-
-    for (sym = 0; sym < 6; ++sym)
-    {
-        if (MASK(tmp_indices[sym], i0, j0, k0))
-            break;
-    }
-
-    const int64_t *perm2 = tp_t3[tmp_indices[sym]];
-    idx = MAP(tmp_indices[sym], i0, j0, k0);
-
-#pragma omp parallel for collapse(3) schedule(static)
-    for (int64_t a = 0; a < nvir; ++a)
-    {
-        for (int64_t b = 0; b < nvir; ++b)
-        {
-            for (int64_t c = 0; c < nvir; ++c)
-            {
-                int64_t abc[3] = {a, b, c};
-                int64_t aa = abc[perm2[0]];
-                int64_t bb = abc[perm2[1]];
-                int64_t cc = abc[perm2[2]];
-
-                int64_t src_idx = ((idx * nvir + a) * nvir + b) * nvir + c;
-                int64_t dest_idx = (aa * nvir + bb) * nvir + cc;
 
-                t3_blk[dest_idx] += t3_tri[src_idx];
-            }
-        }
-    }
-#undef MAP
-#undef MASK
-}
-
-// Unpack triangular-stored T3 amplitudes into a full T3 block.
+// Unpack triangular-stored T3 amplitudes directly into the final block:
 //
-// This kernel reconstructs the full permutation-expanded T3 tensor block from the compressed triangular
-// representation without forming the full tensor in memory.
+//   t3_tmp + t3_tmp.transpose(0, 1, 2, 4, 5, 3)
 //
-// Input:
-//   t3_tri                    : triangular-stored T3 amplitudes
-//   t3_blk                    : output buffer [blk_i * blk_j * blk_k * nvir**3]
-//   map                       : mapping index table for (i, j, k) -> tri index
-//   mask                      : mask indicating which (i, j, k) indices are stored (triangular domain)
-//   [i0:i1), [j0:j1), [k0:k1) : occupied index block ranges
-//   nocc, nvir                : number of occupied / virtual orbitals
-//   blk_i, blk_j, blk_k       : block sizes for the destination tensor
 void unpack_t3_tri2block_pair_(const double *restrict t3_tri,
                                double *restrict t3_blk,
                                const int64_t *restrict map,
@@ -688,9 +594,7 @@ void unpack_t3_tri2block_pair_(const double *restrict t3_tri,
 
 #define MAP(sym, x, y, z) map[(((sym) * nocc + (x)) * nocc + (y)) * nocc + (z)]
 #define MASK(sym, x, y, z) mask[(((sym) * nocc + (x)) * nocc + (y)) * nocc + (z)]
-
-    const int64_t tmp_indices[6] = {5, 3, 4, 1, 2, 0};
-    const int64_t trans_indices[6] = {1, 0, 3, 2, 5, 4};
+#define VIDX(a, b, c) (((a) * nvir + (b)) * nvir + (c))
 
 #pragma omp parallel for collapse(4) schedule(dynamic)
     for (int64_t sym = 0; sym < 6; ++sym)
@@ -719,59 +623,11 @@ void unpack_t3_tri2block_pair_(const double *restrict t3_tri,
                         {
                             for (int64_t c = 0; c < nvir; ++c)
                             {
-                                int64_t abc[3] = {a, b, c};
-                                int64_t aa = abc[perm[0]];
-                                int64_t bb = abc[perm[1]];
-                                int64_t cc = abc[perm[2]];
+                                const int64_t src0 = src_base + src_idx_from_full3(perm, a, b, c, nvir);
+                                const int64_t src1 = src_base + src_idx_from_full3(perm, b, c, a, nvir);
+                                const int64_t dest = dest_base + VIDX(a, b, c);
 
-                                int64_t src_idx = src_base + (a * nvir + b) * nvir + c;
-                                int64_t dest_idx = dest_base + (aa * nvir + bb) * nvir + cc;
-
-                                t3_blk[dest_idx] = t3_tri[src_idx];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-#pragma omp parallel for collapse(4) schedule(dynamic)
-    for (int64_t sym = 0; sym < 6; ++sym)
-    {
-        for (int64_t i = i0; i < i1; ++i)
-        {
-            for (int64_t j = j0; j < j1; ++j)
-            {
-                for (int64_t k = k0; k < k1; ++k)
-                {
-                    if (!MASK(tmp_indices[sym], i, j, k))
-                        continue;
-
-                    const int64_t *perm2 = tp_t3[trans_indices[sym]];
-
-                    int64_t loc_i = i - i0;
-                    int64_t loc_j = j - j0;
-                    int64_t loc_k = k - k0;
-
-                    int64_t src_base = MAP(tmp_indices[sym], i, j, k) * nvir * nvir * nvir;
-                    int64_t dest_base = ((loc_i * blk_j + loc_j) * blk_k + loc_k) * nvir * nvir * nvir;
-
-                    for (int64_t a = 0; a < nvir; ++a)
-                    {
-                        for (int64_t b = 0; b < nvir; ++b)
-                        {
-                            for (int64_t c = 0; c < nvir; ++c)
-                            {
-                                int64_t abc[3] = {a, b, c};
-                                int64_t aa = abc[perm2[0]];
-                                int64_t bb = abc[perm2[1]];
-                                int64_t cc = abc[perm2[2]];
-
-                                int64_t src_idx = src_base + (a * nvir + b) * nvir + c;
-                                int64_t dest_idx = dest_base + (aa * nvir + bb) * nvir + cc;
-
-                                t3_blk[dest_idx] += t3_tri[src_idx];
+                                t3_blk[dest] = t3_tri[src0] + t3_tri[src1];
                             }
                         }
                     }
@@ -851,40 +707,3 @@ void accumulate_t3_block2tri_(double *restrict t3_tri,
     }
 #undef MAP
 }
-
-// Accumulate a single (i0, j0, k0) full T3 slice into the triangular 6-fold compressed T3 storage.
-//
-// Inputs
-//   t3_tri      : triangular-stored T3 amplitudes
-//   t3_blk      : full T3 slice [nvir**3] for (i0, j0, k0)
-//   map         : mapping (sym, i, j, k) -> triangular index (sym = 0 used here)
-//   i0, j0, k0  : occupied indices
-//   nocc        : number of occupied orbitals
-//   nvir        : number of virtual orbitals
-//   alpha, beta : scaling coefficients for accumulation
-void accumulate_t3_single2tri_(double *restrict t3_tri,
-                               const double *restrict t3_blk,
-                               const int64_t *restrict map,
-                               int64_t i0, int64_t j0, int64_t k0,
-                               int64_t nocc, int64_t nvir,
-                               double alpha, double beta)
-{
-#define MAP(sym, x, y, z) map[(((sym) * nocc + (x)) * nocc + (y)) * nocc + (z)]
-
-    int64_t p = MAP(0, i0, j0, k0);
-    int64_t tri_base = p * nvir * nvir * nvir;
-
-#pragma omp parallel for collapse(3) schedule(static)
-    for (int64_t a = 0; a < nvir; ++a)
-    {
-        for (int64_t b = 0; b < nvir; ++b)
-        {
-            for (int64_t c = 0; c < nvir; ++c)
-            {
-                int64_t idx = ((a * nvir + b) * nvir + c);
-                t3_tri[tri_base + idx] = beta * t3_tri[tri_base + idx] + alpha * t3_blk[idx];
-            }
-        }
-    }
-#undef MAP
-}
diff --git a/pyscf/lib/ccsdt/rccsdtq.c b/pyscf/lib/ccsdt/rccsdtq.c
index 3a25a803ea..10f916de5d 100644
--- a/pyscf/lib/ccsdt/rccsdtq.c
+++ b/pyscf/lib/ccsdt/rccsdtq.c
@@ -1,4 +1,4 @@
-/* Copyright 2014-2025 The PySCF Developers. All Rights Reserved.
+/* Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
     you may not use this file except in compliance with the License.
@@ -27,8 +27,9 @@
 // Apply spin summation projection to T4 amplitudes in place.
 // A: pointer to T4 tensor (size nocc4 * nvir**4)
 // pattern: "P4_full" : P(A) = (1 + P_c^d) (1 + P_b^c + P_b^d) (1 + P_a^b + P_a^c + P_a^d) A
-//          "P4_422"  : P(A) = (1 + 0 * P_c^d) (1 + 0 * P_b^c + 0 * P_b^d) (2 - P_a^b - P_a^c - P_a^d) A
-//          "P4_201"  : P(A) = (1 + 0 * P_c^d) (2 - P_b^c - P_b^d) (2 - P_a^b - P_a^c - P_a^d) A
+//          "P4_444"  : P(A) = (2 - P_c^d) (2 - P_b^c - P_b^d) (2 - P_a^b - P_a^c - P_a^d) A
+//          "P4_422"  : P(A) = (1 + 0 * P_c^d) (2 - P_b^c - P_b^d) (2 - P_a^b - P_a^c - P_a^d) A
+//          "P4_201"  : P(A) = (1 + 0 * P_c^d) (1 + 0 * P_b^c + 0 * P_b^d) (2 - P_a^b - P_a^c - P_a^d) A
 // alpha, beta: A = beta * A + alpha * P(A)
 void t4_spin_summation_inplace_(double *A, int64_t nocc4, int64_t nvir, char *pattern, double alpha, double beta)
 {
@@ -69,6 +70,18 @@ void t4_spin_summation_inplace_(double *A, int64_t nocc4, int64_t nvir, char *pa
         p[7] = 1.0;
         p[8] = 0.0;
     }
+    else if (strcmp(pattern, "P4_444") == 0)
+    {
+        p[0] = 2.0;
+        p[1] = -1.0;
+        p[2] = -1.0;
+        p[3] = -1.0;
+        p[4] = 2.0;
+        p[5] = -1.0;
+        p[6] = -1.0;
+        p[7] = 2.0;
+        p[8] = -1.0;
+    }
     else
     {
         fprintf(stderr, "Error: unrecognized pattern \"%s\"\n", pattern);
@@ -530,6 +543,18 @@ void t4_spin_summation(const double *A, double *B, int64_t nocc4, int64_t nvir,
         p[7] = 1.0;
         p[8] = 0.0;
     }
+    else if (strcmp(pattern, "P4_444") == 0)
+    {
+        p[0] = 2.0;
+        p[1] = -1.0;
+        p[2] = -1.0;
+        p[3] = -1.0;
+        p[4] = 2.0;
+        p[5] = -1.0;
+        p[6] = -1.0;
+        p[7] = 2.0;
+        p[8] = -1.0;
+    }
     else
     {
         fprintf(stderr, "Error: unrecognized pattern \"%s\"\n", pattern);
@@ -947,6 +972,491 @@ void t4_spin_summation(const double *A, double *B, int64_t nocc4, int64_t nvir,
     }
 }
 
+void t4_spin_summation_single_inplace_(double *A, int64_t nvir, char *pattern, double alpha, double beta)
+{
+    int64_t nvv = nvir * nvir;
+    int64_t nvvv = nvir * nvv;
+
+    double p[9];
+
+    if (strcmp(pattern, "P4_full") == 0)
+    {
+        for (int i = 0; i < 9; i++)
+            p[i] = 1.0;
+    }
+    else if (strcmp(pattern, "P4_201") == 0)
+    {
+        p[0] = 2.0;
+        p[1] = -1.0;
+        p[2] = -1.0;
+        p[3] = -1.0;
+        p[4] = 1.0;
+        p[5] = 0.0;
+        p[6] = 0.0;
+        p[7] = 1.0;
+        p[8] = 0.0;
+    }
+    else if (strcmp(pattern, "P4_442") == 0)
+    {
+        p[0] = 2.0;
+        p[1] = -1.0;
+        p[2] = -1.0;
+        p[3] = -1.0;
+        p[4] = 2.0;
+        p[5] = -1.0;
+        p[6] = -1.0;
+        p[7] = 1.0;
+        p[8] = 0.0;
+    }
+    else if (strcmp(pattern, "P4_444") == 0)
+    {
+        p[0] = 2.0;
+        p[1] = -1.0;
+        p[2] = -1.0;
+        p[3] = -1.0;
+        p[4] = 2.0;
+        p[5] = -1.0;
+        p[6] = -1.0;
+        p[7] = 2.0;
+        p[8] = -1.0;
+    }
+    else
+    {
+        fprintf(stderr, "Error: unrecognized pattern \"%s\"\n", pattern);
+        return;
+    }
+
+    int64_t total_combinations = (nvir * (nvir + 1) * (nvir + 2) * (nvir + 3)) / 24;
+
+#pragma omp parallel for schedule(static)
+    for (int64_t idx_linear = 0; idx_linear < total_combinations; idx_linear++)
+    {
+        int64_t a, b, c, d;
+        int64_t remaining = idx_linear;
+
+        a = 0;
+        while (a < nvir)
+        {
+            int64_t count_with_a = ((a + 1) * (a + 2) * (a + 3)) / 6;
+            if (remaining < count_with_a)
+            {
+                break;
+            }
+            remaining -= count_with_a;
+            a++;
+        }
+
+        b = 0;
+        while (b <= a)
+        {
+            int64_t count_with_b = ((b + 1) * (b + 2)) / 2;
+            if (remaining < count_with_b)
+            {
+                break;
+            }
+            remaining -= count_with_b;
+            b++;
+        }
+
+        c = 0;
+        while (c <= b)
+        {
+            int64_t count_with_c = c + 1;
+            if (remaining < count_with_c)
+            {
+                break;
+            }
+            remaining -= count_with_c;
+            c++;
+        }
+
+        d = remaining;
+
+        int64_t nvvv = nvir * nvir * nvir;
+        int64_t nvv = nvir * nvir;
+        if (a > b && b > c && c > d)
+        {
+            double T1_local[24];
+            double T2_local[24];
+
+            int64_t indices[24];
+            indices[0] = a * nvvv + b * nvv + c * nvir + d;
+            indices[1] = a * nvvv + b * nvv + d * nvir + c;
+            indices[2] = a * nvvv + c * nvv + b * nvir + d;
+            indices[3] = a * nvvv + c * nvv + d * nvir + b;
+            indices[4] = a * nvvv + d * nvv + b * nvir + c;
+            indices[5] = a * nvvv + d * nvv + c * nvir + b;
+            indices[6] = b * nvvv + a * nvv + c * nvir + d;
+            indices[7] = b * nvvv + a * nvv + d * nvir + c;
+            indices[8] = b * nvvv + c * nvv + a * nvir + d;
+            indices[9] = b * nvvv + c * nvv + d * nvir + a;
+            indices[10] = b * nvvv + d * nvv + a * nvir + c;
+            indices[11] = b * nvvv + d * nvv + c * nvir + a;
+            indices[12] = c * nvvv + a * nvv + b * nvir + d;
+            indices[13] = c * nvvv + a * nvv + d * nvir + b;
+            indices[14] = c * nvvv + b * nvv + a * nvir + d;
+            indices[15] = c * nvvv + b * nvv + d * nvir + a;
+            indices[16] = c * nvvv + d * nvv + a * nvir + b;
+            indices[17] = c * nvvv + d * nvv + b * nvir + a;
+            indices[18] = d * nvvv + a * nvv + b * nvir + c;
+            indices[19] = d * nvvv + a * nvv + c * nvir + b;
+            indices[20] = d * nvvv + b * nvv + a * nvir + c;
+            indices[21] = d * nvvv + b * nvv + c * nvir + a;
+            indices[22] = d * nvvv + c * nvv + a * nvir + b;
+            indices[23] = d * nvvv + c * nvv + b * nvir + a;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[6]] + p[2] * A[indices[14]] + p[3] * A[indices[21]];
+            T1_local[1] = p[0] * A[indices[1]] + p[1] * A[indices[7]] + p[2] * A[indices[20]] + p[3] * A[indices[15]];
+            T1_local[2] = p[0] * A[indices[2]] + p[1] * A[indices[12]] + p[2] * A[indices[8]] + p[3] * A[indices[23]];
+            T1_local[3] = p[0] * A[indices[3]] + p[1] * A[indices[13]] + p[2] * A[indices[22]] + p[3] * A[indices[9]];
+            T1_local[4] = p[0] * A[indices[4]] + p[1] * A[indices[18]] + p[2] * A[indices[10]] + p[3] * A[indices[17]];
+            T1_local[5] = p[0] * A[indices[5]] + p[1] * A[indices[19]] + p[2] * A[indices[16]] + p[3] * A[indices[11]];
+            T1_local[6] = p[0] * A[indices[6]] + p[1] * A[indices[0]] + p[2] * A[indices[12]] + p[3] * A[indices[19]];
+            T1_local[7] = p[0] * A[indices[7]] + p[1] * A[indices[1]] + p[2] * A[indices[18]] + p[3] * A[indices[13]];
+            T1_local[8] = p[0] * A[indices[8]] + p[1] * A[indices[14]] + p[2] * A[indices[2]] + p[3] * A[indices[22]];
+            T1_local[9] = p[0] * A[indices[9]] + p[1] * A[indices[15]] + p[2] * A[indices[23]] + p[3] * A[indices[3]];
+            T1_local[10] = p[0] * A[indices[10]] + p[1] * A[indices[20]] + p[2] * A[indices[4]] + p[3] * A[indices[16]];
+            T1_local[11] = p[0] * A[indices[11]] + p[1] * A[indices[21]] + p[2] * A[indices[17]] + p[3] * A[indices[5]];
+            T1_local[12] = p[0] * A[indices[12]] + p[1] * A[indices[2]] + p[2] * A[indices[6]] + p[3] * A[indices[18]];
+            T1_local[13] = p[0] * A[indices[13]] + p[1] * A[indices[3]] + p[2] * A[indices[19]] + p[3] * A[indices[7]];
+            T1_local[14] = p[0] * A[indices[14]] + p[1] * A[indices[8]] + p[2] * A[indices[0]] + p[3] * A[indices[20]];
+            T1_local[15] = p[0] * A[indices[15]] + p[1] * A[indices[9]] + p[2] * A[indices[21]] + p[3] * A[indices[1]];
+            T1_local[16] = p[0] * A[indices[16]] + p[1] * A[indices[22]] + p[2] * A[indices[5]] + p[3] * A[indices[10]];
+            T1_local[17] = p[0] * A[indices[17]] + p[1] * A[indices[23]] + p[2] * A[indices[11]] + p[3] * A[indices[4]];
+            T1_local[18] = p[0] * A[indices[18]] + p[1] * A[indices[4]] + p[2] * A[indices[7]] + p[3] * A[indices[12]];
+            T1_local[19] = p[0] * A[indices[19]] + p[1] * A[indices[5]] + p[2] * A[indices[13]] + p[3] * A[indices[6]];
+            T1_local[20] = p[0] * A[indices[20]] + p[1] * A[indices[10]] + p[2] * A[indices[1]] + p[3] * A[indices[14]];
+            T1_local[21] = p[0] * A[indices[21]] + p[1] * A[indices[11]] + p[2] * A[indices[15]] + p[3] * A[indices[0]];
+            T1_local[22] = p[0] * A[indices[22]] + p[1] * A[indices[16]] + p[2] * A[indices[3]] + p[3] * A[indices[8]];
+            T1_local[23] = p[0] * A[indices[23]] + p[1] * A[indices[17]] + p[2] * A[indices[9]] + p[3] * A[indices[2]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[2] + p[6] * T1_local[5];
+            T2_local[1] = p[4] * T1_local[1] + p[5] * T1_local[4] + p[6] * T1_local[3];
+            T2_local[2] = p[4] * T1_local[2] + p[5] * T1_local[0] + p[6] * T1_local[4];
+            T2_local[3] = p[4] * T1_local[3] + p[5] * T1_local[5] + p[6] * T1_local[1];
+            T2_local[4] = p[4] * T1_local[4] + p[5] * T1_local[1] + p[6] * T1_local[2];
+            T2_local[5] = p[4] * T1_local[5] + p[5] * T1_local[3] + p[6] * T1_local[0];
+            T2_local[6] = p[4] * T1_local[6] + p[5] * T1_local[8] + p[6] * T1_local[11];
+            T2_local[7] = p[4] * T1_local[7] + p[5] * T1_local[10] + p[6] * T1_local[9];
+            T2_local[8] = p[4] * T1_local[8] + p[5] * T1_local[6] + p[6] * T1_local[10];
+            T2_local[9] = p[4] * T1_local[9] + p[5] * T1_local[11] + p[6] * T1_local[7];
+            T2_local[10] = p[4] * T1_local[10] + p[5] * T1_local[7] + p[6] * T1_local[8];
+            T2_local[11] = p[4] * T1_local[11] + p[5] * T1_local[9] + p[6] * T1_local[6];
+            T2_local[12] = p[4] * T1_local[12] + p[5] * T1_local[14] + p[6] * T1_local[17];
+            T2_local[13] = p[4] * T1_local[13] + p[5] * T1_local[16] + p[6] * T1_local[15];
+            T2_local[14] = p[4] * T1_local[14] + p[5] * T1_local[12] + p[6] * T1_local[16];
+            T2_local[15] = p[4] * T1_local[15] + p[5] * T1_local[17] + p[6] * T1_local[13];
+            T2_local[16] = p[4] * T1_local[16] + p[5] * T1_local[13] + p[6] * T1_local[14];
+            T2_local[17] = p[4] * T1_local[17] + p[5] * T1_local[15] + p[6] * T1_local[12];
+            T2_local[18] = p[4] * T1_local[18] + p[5] * T1_local[20] + p[6] * T1_local[23];
+            T2_local[19] = p[4] * T1_local[19] + p[5] * T1_local[22] + p[6] * T1_local[21];
+            T2_local[20] = p[4] * T1_local[20] + p[5] * T1_local[18] + p[6] * T1_local[22];
+            T2_local[21] = p[4] * T1_local[21] + p[5] * T1_local[23] + p[6] * T1_local[19];
+            T2_local[22] = p[4] * T1_local[22] + p[5] * T1_local[19] + p[6] * T1_local[20];
+            T2_local[23] = p[4] * T1_local[23] + p[5] * T1_local[21] + p[6] * T1_local[18];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[1]) + beta * A[indices[0]];
+            A[indices[1]] = alpha * (p[7] * T2_local[1] + p[8] * T2_local[0]) + beta * A[indices[1]];
+            A[indices[2]] = alpha * (p[7] * T2_local[2] + p[8] * T2_local[3]) + beta * A[indices[2]];
+            A[indices[3]] = alpha * (p[7] * T2_local[3] + p[8] * T2_local[2]) + beta * A[indices[3]];
+            A[indices[4]] = alpha * (p[7] * T2_local[4] + p[8] * T2_local[5]) + beta * A[indices[4]];
+            A[indices[5]] = alpha * (p[7] * T2_local[5] + p[8] * T2_local[4]) + beta * A[indices[5]];
+            A[indices[6]] = alpha * (p[7] * T2_local[6] + p[8] * T2_local[7]) + beta * A[indices[6]];
+            A[indices[7]] = alpha * (p[7] * T2_local[7] + p[8] * T2_local[6]) + beta * A[indices[7]];
+            A[indices[8]] = alpha * (p[7] * T2_local[8] + p[8] * T2_local[9]) + beta * A[indices[8]];
+            A[indices[9]] = alpha * (p[7] * T2_local[9] + p[8] * T2_local[8]) + beta * A[indices[9]];
+            A[indices[10]] = alpha * (p[7] * T2_local[10] + p[8] * T2_local[11]) + beta * A[indices[10]];
+            A[indices[11]] = alpha * (p[7] * T2_local[11] + p[8] * T2_local[10]) + beta * A[indices[11]];
+            A[indices[12]] = alpha * (p[7] * T2_local[12] + p[8] * T2_local[13]) + beta * A[indices[12]];
+            A[indices[13]] = alpha * (p[7] * T2_local[13] + p[8] * T2_local[12]) + beta * A[indices[13]];
+            A[indices[14]] = alpha * (p[7] * T2_local[14] + p[8] * T2_local[15]) + beta * A[indices[14]];
+            A[indices[15]] = alpha * (p[7] * T2_local[15] + p[8] * T2_local[14]) + beta * A[indices[15]];
+            A[indices[16]] = alpha * (p[7] * T2_local[16] + p[8] * T2_local[17]) + beta * A[indices[16]];
+            A[indices[17]] = alpha * (p[7] * T2_local[17] + p[8] * T2_local[16]) + beta * A[indices[17]];
+            A[indices[18]] = alpha * (p[7] * T2_local[18] + p[8] * T2_local[19]) + beta * A[indices[18]];
+            A[indices[19]] = alpha * (p[7] * T2_local[19] + p[8] * T2_local[18]) + beta * A[indices[19]];
+            A[indices[20]] = alpha * (p[7] * T2_local[20] + p[8] * T2_local[21]) + beta * A[indices[20]];
+            A[indices[21]] = alpha * (p[7] * T2_local[21] + p[8] * T2_local[20]) + beta * A[indices[21]];
+            A[indices[22]] = alpha * (p[7] * T2_local[22] + p[8] * T2_local[23]) + beta * A[indices[22]];
+            A[indices[23]] = alpha * (p[7] * T2_local[23] + p[8] * T2_local[22]) + beta * A[indices[23]];
+        }
+        else if (a > b && b > c && c == d)
+        {
+            double T1_local[12];
+            double T2_local[12];
+
+            int64_t indices[12];
+            indices[0] = a * nvvv + b * nvv + c * nvir + c;
+            indices[1] = a * nvvv + c * nvv + b * nvir + c;
+            indices[2] = a * nvvv + c * nvv + c * nvir + b;
+            indices[3] = b * nvvv + a * nvv + c * nvir + c;
+            indices[4] = b * nvvv + c * nvv + a * nvir + c;
+            indices[5] = b * nvvv + c * nvv + c * nvir + a;
+            indices[6] = c * nvvv + a * nvv + b * nvir + c;
+            indices[7] = c * nvvv + a * nvv + c * nvir + b;
+            indices[8] = c * nvvv + b * nvv + a * nvir + c;
+            indices[9] = c * nvvv + b * nvv + c * nvir + a;
+            indices[10] = c * nvvv + c * nvv + a * nvir + b;
+            indices[11] = c * nvvv + c * nvv + b * nvir + a;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[3]] + p[2] * A[indices[8]] + p[3] * A[indices[9]];
+            T1_local[1] = p[0] * A[indices[1]] + p[1] * A[indices[6]] + p[2] * A[indices[4]] + p[3] * A[indices[11]];
+            T1_local[2] = p[0] * A[indices[2]] + p[1] * A[indices[7]] + p[2] * A[indices[10]] + p[3] * A[indices[5]];
+            T1_local[3] = p[0] * A[indices[3]] + p[1] * A[indices[0]] + p[2] * A[indices[6]] + p[3] * A[indices[7]];
+            T1_local[4] = p[0] * A[indices[4]] + p[1] * A[indices[8]] + p[2] * A[indices[1]] + p[3] * A[indices[10]];
+            T1_local[5] = p[0] * A[indices[5]] + p[1] * A[indices[9]] + p[2] * A[indices[11]] + p[3] * A[indices[2]];
+            T1_local[6] = p[0] * A[indices[6]] + p[1] * A[indices[1]] + p[2] * A[indices[3]] + p[3] * A[indices[6]];
+            T1_local[7] = p[0] * A[indices[7]] + p[1] * A[indices[2]] + p[2] * A[indices[7]] + p[3] * A[indices[3]];
+            T1_local[8] = p[0] * A[indices[8]] + p[1] * A[indices[4]] + p[2] * A[indices[0]] + p[3] * A[indices[8]];
+            T1_local[9] = p[0] * A[indices[9]] + p[1] * A[indices[5]] + p[2] * A[indices[9]] + p[3] * A[indices[0]];
+            T1_local[10] = p[0] * A[indices[10]] + p[1] * A[indices[10]] + p[2] * A[indices[2]] + p[3] * A[indices[4]];
+            T1_local[11] = p[0] * A[indices[11]] + p[1] * A[indices[11]] + p[2] * A[indices[5]] + p[3] * A[indices[1]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[1] + p[6] * T1_local[2];
+            T2_local[1] = p[4] * T1_local[1] + p[5] * T1_local[0] + p[6] * T1_local[1];
+            T2_local[2] = p[4] * T1_local[2] + p[5] * T1_local[2] + p[6] * T1_local[0];
+            T2_local[3] = p[4] * T1_local[3] + p[5] * T1_local[4] + p[6] * T1_local[5];
+            T2_local[4] = p[4] * T1_local[4] + p[5] * T1_local[3] + p[6] * T1_local[4];
+            T2_local[5] = p[4] * T1_local[5] + p[5] * T1_local[5] + p[6] * T1_local[3];
+            T2_local[6] = p[4] * T1_local[6] + p[5] * T1_local[8] + p[6] * T1_local[11];
+            T2_local[7] = p[4] * T1_local[7] + p[5] * T1_local[10] + p[6] * T1_local[9];
+            T2_local[8] = p[4] * T1_local[8] + p[5] * T1_local[6] + p[6] * T1_local[10];
+            T2_local[9] = p[4] * T1_local[9] + p[5] * T1_local[11] + p[6] * T1_local[7];
+            T2_local[10] = p[4] * T1_local[10] + p[5] * T1_local[7] + p[6] * T1_local[8];
+            T2_local[11] = p[4] * T1_local[11] + p[5] * T1_local[9] + p[6] * T1_local[6];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[0]) + beta * A[indices[0]];
+            A[indices[1]] = alpha * (p[7] * T2_local[1] + p[8] * T2_local[2]) + beta * A[indices[1]];
+            A[indices[2]] = alpha * (p[7] * T2_local[2] + p[8] * T2_local[1]) + beta * A[indices[2]];
+            A[indices[3]] = alpha * (p[7] * T2_local[3] + p[8] * T2_local[3]) + beta * A[indices[3]];
+            A[indices[4]] = alpha * (p[7] * T2_local[4] + p[8] * T2_local[5]) + beta * A[indices[4]];
+            A[indices[5]] = alpha * (p[7] * T2_local[5] + p[8] * T2_local[4]) + beta * A[indices[5]];
+            A[indices[6]] = alpha * (p[7] * T2_local[6] + p[8] * T2_local[7]) + beta * A[indices[6]];
+            A[indices[7]] = alpha * (p[7] * T2_local[7] + p[8] * T2_local[6]) + beta * A[indices[7]];
+            A[indices[8]] = alpha * (p[7] * T2_local[8] + p[8] * T2_local[9]) + beta * A[indices[8]];
+            A[indices[9]] = alpha * (p[7] * T2_local[9] + p[8] * T2_local[8]) + beta * A[indices[9]];
+            A[indices[10]] = alpha * (p[7] * T2_local[10] + p[8] * T2_local[11]) + beta * A[indices[10]];
+            A[indices[11]] = alpha * (p[7] * T2_local[11] + p[8] * T2_local[10]) + beta * A[indices[11]];
+        }
+        else if (a > b && b == c && c > d)
+        {
+            double T1_local[12];
+            double T2_local[12];
+
+            int64_t indices[12];
+            indices[0] = a * nvvv + b * nvv + b * nvir + d;
+            indices[1] = a * nvvv + b * nvv + d * nvir + b;
+            indices[2] = a * nvvv + d * nvv + b * nvir + b;
+            indices[3] = b * nvvv + a * nvv + b * nvir + d;
+            indices[4] = b * nvvv + a * nvv + d * nvir + b;
+            indices[5] = b * nvvv + b * nvv + a * nvir + d;
+            indices[6] = b * nvvv + b * nvv + d * nvir + a;
+            indices[7] = b * nvvv + d * nvv + a * nvir + b;
+            indices[8] = b * nvvv + d * nvv + b * nvir + a;
+            indices[9] = d * nvvv + a * nvv + b * nvir + b;
+            indices[10] = d * nvvv + b * nvv + a * nvir + b;
+            indices[11] = d * nvvv + b * nvv + b * nvir + a;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[3]] + p[2] * A[indices[5]] + p[3] * A[indices[11]];
+            T1_local[1] = p[0] * A[indices[1]] + p[1] * A[indices[4]] + p[2] * A[indices[10]] + p[3] * A[indices[6]];
+            T1_local[2] = p[0] * A[indices[2]] + p[1] * A[indices[9]] + p[2] * A[indices[7]] + p[3] * A[indices[8]];
+            T1_local[3] = p[0] * A[indices[3]] + p[1] * A[indices[0]] + p[2] * A[indices[3]] + p[3] * A[indices[9]];
+            T1_local[4] = p[0] * A[indices[4]] + p[1] * A[indices[1]] + p[2] * A[indices[9]] + p[3] * A[indices[4]];
+            T1_local[5] = p[0] * A[indices[5]] + p[1] * A[indices[5]] + p[2] * A[indices[0]] + p[3] * A[indices[10]];
+            T1_local[6] = p[0] * A[indices[6]] + p[1] * A[indices[6]] + p[2] * A[indices[11]] + p[3] * A[indices[1]];
+            T1_local[7] = p[0] * A[indices[7]] + p[1] * A[indices[10]] + p[2] * A[indices[2]] + p[3] * A[indices[7]];
+            T1_local[8] = p[0] * A[indices[8]] + p[1] * A[indices[11]] + p[2] * A[indices[8]] + p[3] * A[indices[2]];
+            T1_local[9] = p[0] * A[indices[9]] + p[1] * A[indices[2]] + p[2] * A[indices[4]] + p[3] * A[indices[3]];
+            T1_local[10] = p[0] * A[indices[10]] + p[1] * A[indices[7]] + p[2] * A[indices[1]] + p[3] * A[indices[5]];
+            T1_local[11] = p[0] * A[indices[11]] + p[1] * A[indices[8]] + p[2] * A[indices[6]] + p[3] * A[indices[0]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[0] + p[6] * T1_local[2];
+            T2_local[1] = p[4] * T1_local[1] + p[5] * T1_local[2] + p[6] * T1_local[1];
+            T2_local[2] = p[4] * T1_local[2] + p[5] * T1_local[1] + p[6] * T1_local[0];
+            T2_local[3] = p[4] * T1_local[3] + p[5] * T1_local[5] + p[6] * T1_local[8];
+            T2_local[4] = p[4] * T1_local[4] + p[5] * T1_local[7] + p[6] * T1_local[6];
+            T2_local[5] = p[4] * T1_local[5] + p[5] * T1_local[3] + p[6] * T1_local[7];
+            T2_local[6] = p[4] * T1_local[6] + p[5] * T1_local[8] + p[6] * T1_local[4];
+            T2_local[7] = p[4] * T1_local[7] + p[5] * T1_local[4] + p[6] * T1_local[5];
+            T2_local[8] = p[4] * T1_local[8] + p[5] * T1_local[6] + p[6] * T1_local[3];
+            T2_local[9] = p[4] * T1_local[9] + p[5] * T1_local[10] + p[6] * T1_local[11];
+            T2_local[10] = p[4] * T1_local[10] + p[5] * T1_local[9] + p[6] * T1_local[10];
+            T2_local[11] = p[4] * T1_local[11] + p[5] * T1_local[11] + p[6] * T1_local[9];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[1]) + beta * A[indices[0]];
+            A[indices[1]] = alpha * (p[7] * T2_local[1] + p[8] * T2_local[0]) + beta * A[indices[1]];
+            A[indices[2]] = alpha * (p[7] * T2_local[2] + p[8] * T2_local[2]) + beta * A[indices[2]];
+            A[indices[3]] = alpha * (p[7] * T2_local[3] + p[8] * T2_local[4]) + beta * A[indices[3]];
+            A[indices[4]] = alpha * (p[7] * T2_local[4] + p[8] * T2_local[3]) + beta * A[indices[4]];
+            A[indices[5]] = alpha * (p[7] * T2_local[5] + p[8] * T2_local[6]) + beta * A[indices[5]];
+            A[indices[6]] = alpha * (p[7] * T2_local[6] + p[8] * T2_local[5]) + beta * A[indices[6]];
+            A[indices[7]] = alpha * (p[7] * T2_local[7] + p[8] * T2_local[8]) + beta * A[indices[7]];
+            A[indices[8]] = alpha * (p[7] * T2_local[8] + p[8] * T2_local[7]) + beta * A[indices[8]];
+            A[indices[9]] = alpha * (p[7] * T2_local[9] + p[8] * T2_local[9]) + beta * A[indices[9]];
+            A[indices[10]] = alpha * (p[7] * T2_local[10] + p[8] * T2_local[11]) + beta * A[indices[10]];
+            A[indices[11]] = alpha * (p[7] * T2_local[11] + p[8] * T2_local[10]) + beta * A[indices[11]];
+        }
+        else if (a == b && b > c && c > d)
+        {
+            double T1_local[12];
+            double T2_local[12];
+
+            int64_t indices[12];
+            indices[0] = a * nvvv + a * nvv + c * nvir + d;
+            indices[1] = a * nvvv + a * nvv + d * nvir + c;
+            indices[2] = a * nvvv + c * nvv + a * nvir + d;
+            indices[3] = a * nvvv + c * nvv + d * nvir + a;
+            indices[4] = a * nvvv + d * nvv + a * nvir + c;
+            indices[5] = a * nvvv + d * nvv + c * nvir + a;
+            indices[6] = c * nvvv + a * nvv + a * nvir + d;
+            indices[7] = c * nvvv + a * nvv + d * nvir + a;
+            indices[8] = c * nvvv + d * nvv + a * nvir + a;
+            indices[9] = d * nvvv + a * nvv + a * nvir + c;
+            indices[10] = d * nvvv + a * nvv + c * nvir + a;
+            indices[11] = d * nvvv + c * nvv + a * nvir + a;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[0]] + p[2] * A[indices[6]] + p[3] * A[indices[10]];
+            T1_local[1] = p[0] * A[indices[1]] + p[1] * A[indices[1]] + p[2] * A[indices[9]] + p[3] * A[indices[7]];
+            T1_local[2] = p[0] * A[indices[2]] + p[1] * A[indices[6]] + p[2] * A[indices[2]] + p[3] * A[indices[11]];
+            T1_local[3] = p[0] * A[indices[3]] + p[1] * A[indices[7]] + p[2] * A[indices[11]] + p[3] * A[indices[3]];
+            T1_local[4] = p[0] * A[indices[4]] + p[1] * A[indices[9]] + p[2] * A[indices[4]] + p[3] * A[indices[8]];
+            T1_local[5] = p[0] * A[indices[5]] + p[1] * A[indices[10]] + p[2] * A[indices[8]] + p[3] * A[indices[5]];
+            T1_local[6] = p[0] * A[indices[6]] + p[1] * A[indices[2]] + p[2] * A[indices[0]] + p[3] * A[indices[9]];
+            T1_local[7] = p[0] * A[indices[7]] + p[1] * A[indices[3]] + p[2] * A[indices[10]] + p[3] * A[indices[1]];
+            T1_local[8] = p[0] * A[indices[8]] + p[1] * A[indices[11]] + p[2] * A[indices[5]] + p[3] * A[indices[4]];
+            T1_local[9] = p[0] * A[indices[9]] + p[1] * A[indices[4]] + p[2] * A[indices[1]] + p[3] * A[indices[6]];
+            T1_local[10] = p[0] * A[indices[10]] + p[1] * A[indices[5]] + p[2] * A[indices[7]] + p[3] * A[indices[0]];
+            T1_local[11] = p[0] * A[indices[11]] + p[1] * A[indices[8]] + p[2] * A[indices[3]] + p[3] * A[indices[2]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[2] + p[6] * T1_local[5];
+            T2_local[1] = p[4] * T1_local[1] + p[5] * T1_local[4] + p[6] * T1_local[3];
+            T2_local[2] = p[4] * T1_local[2] + p[5] * T1_local[0] + p[6] * T1_local[4];
+            T2_local[3] = p[4] * T1_local[3] + p[5] * T1_local[5] + p[6] * T1_local[1];
+            T2_local[4] = p[4] * T1_local[4] + p[5] * T1_local[1] + p[6] * T1_local[2];
+            T2_local[5] = p[4] * T1_local[5] + p[5] * T1_local[3] + p[6] * T1_local[0];
+            T2_local[6] = p[4] * T1_local[6] + p[5] * T1_local[6] + p[6] * T1_local[8];
+            T2_local[7] = p[4] * T1_local[7] + p[5] * T1_local[8] + p[6] * T1_local[7];
+            T2_local[8] = p[4] * T1_local[8] + p[5] * T1_local[7] + p[6] * T1_local[6];
+            T2_local[9] = p[4] * T1_local[9] + p[5] * T1_local[9] + p[6] * T1_local[11];
+            T2_local[10] = p[4] * T1_local[10] + p[5] * T1_local[11] + p[6] * T1_local[10];
+            T2_local[11] = p[4] * T1_local[11] + p[5] * T1_local[10] + p[6] * T1_local[9];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[1]) + beta * A[indices[0]];
+            A[indices[1]] = alpha * (p[7] * T2_local[1] + p[8] * T2_local[0]) + beta * A[indices[1]];
+            A[indices[2]] = alpha * (p[7] * T2_local[2] + p[8] * T2_local[3]) + beta * A[indices[2]];
+            A[indices[3]] = alpha * (p[7] * T2_local[3] + p[8] * T2_local[2]) + beta * A[indices[3]];
+            A[indices[4]] = alpha * (p[7] * T2_local[4] + p[8] * T2_local[5]) + beta * A[indices[4]];
+            A[indices[5]] = alpha * (p[7] * T2_local[5] + p[8] * T2_local[4]) + beta * A[indices[5]];
+            A[indices[6]] = alpha * (p[7] * T2_local[6] + p[8] * T2_local[7]) + beta * A[indices[6]];
+            A[indices[7]] = alpha * (p[7] * T2_local[7] + p[8] * T2_local[6]) + beta * A[indices[7]];
+            A[indices[8]] = alpha * (p[7] * T2_local[8] + p[8] * T2_local[8]) + beta * A[indices[8]];
+            A[indices[9]] = alpha * (p[7] * T2_local[9] + p[8] * T2_local[10]) + beta * A[indices[9]];
+            A[indices[10]] = alpha * (p[7] * T2_local[10] + p[8] * T2_local[9]) + beta * A[indices[10]];
+            A[indices[11]] = alpha * (p[7] * T2_local[11] + p[8] * T2_local[11]) + beta * A[indices[11]];
+        }
+        else if (a > b && b == c && c == d)
+        {
+            double T1_local[4];
+            double T2_local[4];
+
+            int64_t indices[4];
+            indices[0] = a * nvvv + b * nvv + b * nvir + b;
+            indices[1] = b * nvvv + a * nvv + b * nvir + b;
+            indices[2] = b * nvvv + b * nvv + a * nvir + b;
+            indices[3] = b * nvvv + b * nvv + b * nvir + a;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[1]] + p[2] * A[indices[2]] + p[3] * A[indices[3]];
+            T1_local[1] = p[0] * A[indices[1]] + p[1] * A[indices[0]] + p[2] * A[indices[1]] + p[3] * A[indices[1]];
+            T1_local[2] = p[0] * A[indices[2]] + p[1] * A[indices[2]] + p[2] * A[indices[0]] + p[3] * A[indices[2]];
+            T1_local[3] = p[0] * A[indices[3]] + p[1] * A[indices[3]] + p[2] * A[indices[3]] + p[3] * A[indices[0]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[0] + p[6] * T1_local[0];
+            T2_local[1] = p[4] * T1_local[1] + p[5] * T1_local[2] + p[6] * T1_local[3];
+            T2_local[2] = p[4] * T1_local[2] + p[5] * T1_local[1] + p[6] * T1_local[2];
+            T2_local[3] = p[4] * T1_local[3] + p[5] * T1_local[3] + p[6] * T1_local[1];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[0]) + beta * A[indices[0]];
+            A[indices[1]] = alpha * (p[7] * T2_local[1] + p[8] * T2_local[1]) + beta * A[indices[1]];
+            A[indices[2]] = alpha * (p[7] * T2_local[2] + p[8] * T2_local[3]) + beta * A[indices[2]];
+            A[indices[3]] = alpha * (p[7] * T2_local[3] + p[8] * T2_local[2]) + beta * A[indices[3]];
+        }
+        else if (a == b && b == c && c > d)
+        {
+            double T1_local[4];
+            double T2_local[4];
+
+            int64_t indices[4];
+            indices[0] = a * nvvv + a * nvv + a * nvir + d;
+            indices[1] = a * nvvv + a * nvv + d * nvir + a;
+            indices[2] = a * nvvv + d * nvv + a * nvir + a;
+            indices[3] = d * nvvv + a * nvv + a * nvir + a;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[0]] + p[2] * A[indices[0]] + p[3] * A[indices[3]];
+            T1_local[1] = p[0] * A[indices[1]] + p[1] * A[indices[1]] + p[2] * A[indices[3]] + p[3] * A[indices[1]];
+            T1_local[2] = p[0] * A[indices[2]] + p[1] * A[indices[3]] + p[2] * A[indices[2]] + p[3] * A[indices[2]];
+            T1_local[3] = p[0] * A[indices[3]] + p[1] * A[indices[2]] + p[2] * A[indices[1]] + p[3] * A[indices[0]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[0] + p[6] * T1_local[2];
+            T2_local[1] = p[4] * T1_local[1] + p[5] * T1_local[2] + p[6] * T1_local[1];
+            T2_local[2] = p[4] * T1_local[2] + p[5] * T1_local[1] + p[6] * T1_local[0];
+            T2_local[3] = p[4] * T1_local[3] + p[5] * T1_local[3] + p[6] * T1_local[3];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[1]) + beta * A[indices[0]];
+            A[indices[1]] = alpha * (p[7] * T2_local[1] + p[8] * T2_local[0]) + beta * A[indices[1]];
+            A[indices[2]] = alpha * (p[7] * T2_local[2] + p[8] * T2_local[2]) + beta * A[indices[2]];
+            A[indices[3]] = alpha * (p[7] * T2_local[3] + p[8] * T2_local[3]) + beta * A[indices[3]];
+        }
+        else if (a == b && b > c && c == d)
+        {
+            double T1_local[6];
+            double T2_local[6];
+
+            int64_t indices[6];
+            indices[0] = b * nvvv + b * nvv + c * nvir + c;
+            indices[1] = b * nvvv + c * nvv + b * nvir + c;
+            indices[2] = b * nvvv + c * nvv + c * nvir + b;
+            indices[3] = c * nvvv + b * nvv + b * nvir + c;
+            indices[4] = c * nvvv + b * nvv + c * nvir + b;
+            indices[5] = c * nvvv + c * nvv + b * nvir + b;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[0]] + p[2] * A[indices[3]] + p[3] * A[indices[4]];
+            T1_local[1] = p[0] * A[indices[1]] + p[1] * A[indices[3]] + p[2] * A[indices[1]] + p[3] * A[indices[5]];
+            T1_local[2] = p[0] * A[indices[2]] + p[1] * A[indices[4]] + p[2] * A[indices[5]] + p[3] * A[indices[2]];
+            T1_local[3] = p[0] * A[indices[3]] + p[1] * A[indices[1]] + p[2] * A[indices[0]] + p[3] * A[indices[3]];
+            T1_local[4] = p[0] * A[indices[4]] + p[1] * A[indices[2]] + p[2] * A[indices[4]] + p[3] * A[indices[0]];
+            T1_local[5] = p[0] * A[indices[5]] + p[1] * A[indices[5]] + p[2] * A[indices[2]] + p[3] * A[indices[1]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[1] + p[6] * T1_local[2];
+            T2_local[1] = p[4] * T1_local[1] + p[5] * T1_local[0] + p[6] * T1_local[1];
+            T2_local[2] = p[4] * T1_local[2] + p[5] * T1_local[2] + p[6] * T1_local[0];
+            T2_local[3] = p[4] * T1_local[3] + p[5] * T1_local[3] + p[6] * T1_local[5];
+            T2_local[4] = p[4] * T1_local[4] + p[5] * T1_local[5] + p[6] * T1_local[4];
+            T2_local[5] = p[4] * T1_local[5] + p[5] * T1_local[4] + p[6] * T1_local[3];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[0]) + beta * A[indices[0]];
+            A[indices[1]] = alpha * (p[7] * T2_local[1] + p[8] * T2_local[2]) + beta * A[indices[1]];
+            A[indices[2]] = alpha * (p[7] * T2_local[2] + p[8] * T2_local[1]) + beta * A[indices[2]];
+            A[indices[3]] = alpha * (p[7] * T2_local[3] + p[8] * T2_local[4]) + beta * A[indices[3]];
+            A[indices[4]] = alpha * (p[7] * T2_local[4] + p[8] * T2_local[3]) + beta * A[indices[4]];
+            A[indices[5]] = alpha * (p[7] * T2_local[5] + p[8] * T2_local[5]) + beta * A[indices[5]];
+        }
+        else if (a == b && b == c && c == d)
+        {
+            double T1_local[1];
+            double T2_local[1];
+
+            int64_t indices[1];
+            indices[0] = a * nvvv + a * nvv + a * nvir + a;
+
+            T1_local[0] = p[0] * A[indices[0]] + p[1] * A[indices[0]] + p[2] * A[indices[0]] + p[3] * A[indices[0]];
+
+            T2_local[0] = p[4] * T1_local[0] + p[5] * T1_local[0] + p[6] * T1_local[0];
+
+            A[indices[0]] = alpha * (p[7] * T2_local[0] + p[8] * T2_local[0]) + beta * A[indices[0]];
+        }
+    }
+}
+
 // Apply permutation-symmetry projection to T4 amplitudes in place.
 // A = beta * A + alpha * P(A)
 // where P(A) ijklabcd = ijklabcd + ijlkabdc + ...
@@ -1256,6 +1766,38 @@ void eijkl_division_(double *r4, const double *eia, const int64_t nocc, const in
     }
 }
 
+void eijkl_division_single_(double *r4, const double *e_occ, const double *e_vir,
+                            const int64_t i, const int64_t j, const int64_t k, const int64_t l, const int64_t nvir)
+{
+    double eijkl = e_occ[i] + e_occ[j] + e_occ[k] + e_occ[l];
+
+#pragma omp parallel for collapse(4) schedule(static)
+    for (int64_t a = 0; a < nvir; a++)
+    {
+        for (int64_t b = 0; b < nvir; b++)
+        {
+            for (int64_t c = 0; c < nvir; c++)
+            {
+                for (int64_t d = 0; d < nvir; d++)
+                {
+                    int64_t r4_idx = ((a * nvir + b) * nvir + c) * nvir + d;
+
+                    double eijklabcd = eijkl - e_vir[a] - e_vir[b] - e_vir[c] - e_vir[d];
+
+                    if (fabs(eijklabcd) > 1e-15)
+                    {
+                        r4[r4_idx] /= eijklabcd;
+                    }
+                    else
+                    {
+                        r4[r4_idx] = 0.0;
+                    }
+                }
+            }
+        }
+    }
+}
+
 void t4_add_(double *t4, const double *r4, const int64_t nocc4, const int64_t nvir)
 {
     const int64_t total_size = nocc4 * nvir * nvir * nvir * nvir;
@@ -1294,6 +1836,18 @@ const int64_t tp_t4[24][4] = {
     {3, 2, 1, 0},
 };
 
+static inline int64_t src_idx_from_full4(const int64_t *restrict perm, int64_t v0, int64_t v1, int64_t v2, int64_t v3, int64_t nvir)
+{
+    int64_t src_abcd[4];
+
+    src_abcd[perm[0]] = v0;
+    src_abcd[perm[1]] = v1;
+    src_abcd[perm[2]] = v2;
+    src_abcd[perm[3]] = v3;
+
+    return (((src_abcd[0] * nvir + src_abcd[1]) * nvir + src_abcd[2]) * nvir + src_abcd[3]);
+}
+
 // Unpack triangular-stored T4 amplitudes into a full T4 block.
 //
 // This kernel reconstructs the full permutation-expanded T4 tensor block from the compressed triangular
@@ -1320,6 +1874,7 @@ void unpack_t4_tri2block_(const double *restrict t4_tri,
 {
 #define MAP(sym, w, x, y, z) map[((((sym) * nocc + (w)) * nocc + (x)) * nocc + (y)) * nocc + (z)]
 #define MASK(sym, w, x, y, z) mask[((((sym) * nocc + (w)) * nocc + (x)) * nocc + (y)) * nocc + (z)]
+#define VIDX(a, b, c, d) ((((a) * nvir + (b)) * nvir + (c)) * nvir + (d))
 
 #pragma omp parallel for collapse(5) schedule(dynamic)
     for (int64_t sym = 0; sym < 24; ++sym)
@@ -1353,16 +1908,82 @@ void unpack_t4_tri2block_(const double *restrict t4_tri,
                                 {
                                     for (int64_t d = 0; d < nvir; ++d)
                                     {
-                                        int64_t abcd[4] = {a, b, c, d};
-                                        int64_t aa = abcd[perm[0]];
-                                        int64_t bb = abcd[perm[1]];
-                                        int64_t cc = abcd[perm[2]];
-                                        int64_t dd = abcd[perm[3]];
+                                        int64_t src = src_base + src_idx_from_full4(perm, a, b, c, d, nvir);
+                                        int64_t dest = dest_base + VIDX(a, b, c, d);
+
+                                        t4_blk[dest] = t4_tri[src];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+#undef MAP
+#undef MASK
+}
+
+// Unpack triangular-stored T4 amplitudes directly into the final block:
+//
+//   t4_tmp + t4_tmp.transpose(0, 1, 2, 3, 5, 6, 4, 7) + t4_tmp.transpose(0, 1, 2, 3, 5, 7, 6, 4)
+//
+void unpack_t4_tri2block_triples_(const double *restrict t4_tri,
+                                  double *restrict t4_blk,
+                                  const int64_t *restrict map,
+                                  const bool *restrict mask,
+                                  int64_t i0, int64_t i1,
+                                  int64_t j0, int64_t j1,
+                                  int64_t k0, int64_t k1,
+                                  int64_t l0, int64_t l1,
+                                  int64_t nocc, int64_t nvir,
+                                  int64_t blk_i, int64_t blk_j, int64_t blk_k, int64_t blk_l)
+{
+#define MAP(sym, w, x, y, z) map[((((sym) * nocc + (w)) * nocc + (x)) * nocc + (y)) * nocc + (z)]
+#define MASK(sym, w, x, y, z) mask[((((sym) * nocc + (w)) * nocc + (x)) * nocc + (y)) * nocc + (z)]
+#define VIDX(a, b, c, d) ((((a) * nvir + (b)) * nvir + (c)) * nvir + (d))
+
+    const int64_t nvir4 = nvir * nvir * nvir * nvir;
 
-                                        int64_t src_idx = src_base + ((a * nvir + b) * nvir + c) * nvir + d;
-                                        int64_t dest_idx = dest_base + ((aa * nvir + bb) * nvir + cc) * nvir + dd;
+#pragma omp parallel for collapse(5) schedule(dynamic)
+    for (int64_t sym = 0; sym < 24; ++sym)
+    {
+        for (int64_t i = i0; i < i1; ++i)
+        {
+            for (int64_t j = j0; j < j1; ++j)
+            {
+                for (int64_t k = k0; k < k1; ++k)
+                {
+                    for (int64_t l = l0; l < l1; ++l)
+                    {
+                        if (!MASK(sym, i, j, k, l))
+                            continue;
+
+                        const int64_t *perm = tp_t4[sym];
 
-                                        t4_blk[dest_idx] = t4_tri[src_idx];
+                        const int64_t loc_i = i - i0;
+                        const int64_t loc_j = j - j0;
+                        const int64_t loc_k = k - k0;
+                        const int64_t loc_l = l - l0;
+
+                        const int64_t src_base = MAP(sym, i, j, k, l) * nvir4;
+
+                        const int64_t dest_base = (((loc_i * blk_j + loc_j) * blk_k + loc_k) * blk_l + loc_l) * nvir4;
+
+                        for (int64_t a = 0; a < nvir; ++a)
+                        {
+                            for (int64_t b = 0; b < nvir; ++b)
+                            {
+                                for (int64_t c = 0; c < nvir; ++c)
+                                {
+                                    for (int64_t d = 0; d < nvir; ++d)
+                                    {
+                                        const int64_t src0 = src_base + src_idx_from_full4(perm, a, b, c, d, nvir);
+                                        const int64_t src1 = src_base + src_idx_from_full4(perm, c, a, b, d, nvir);
+                                        const int64_t src2 = src_base + src_idx_from_full4(perm, d, a, c, b, nvir);
+                                        const int64_t dest = dest_base + VIDX(a, b, c, d);
+                                        t4_blk[dest] = t4_tri[src0] + t4_tri[src1] + t4_tri[src2];
                                     }
                                 }
                             }
@@ -1372,6 +1993,7 @@ void unpack_t4_tri2block_(const double *restrict t4_tri,
             }
         }
     }
+#undef VIDX
 #undef MAP
 #undef MASK
 }
@@ -1452,3 +2074,324 @@ void accumulate_t4_block2tri_(double *restrict t4_tri,
     }
 #undef MAP
 }
+
+const int64_t swap_pairs[6][2] = {
+    {0, 1}, // ab
+    {0, 2}, // ac
+    {0, 3}, // ad
+    {1, 2}, // bc
+    {1, 3}, // bd
+    {2, 3}  // cd
+};
+
+static inline int64_t idx4(int64_t a, int64_t b, int64_t c, int64_t d, int64_t nvir, int64_t nvv, int64_t nvvv)
+{
+    return a * nvvv + b * nvv + c * nvir + d;
+}
+
+static inline void swap4(int64_t in[4], int p, int q, int64_t out[4])
+{
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+
+    int64_t tmp = out[p];
+    out[p] = out[q];
+    out[q] = tmp;
+}
+
+static inline int same_tuple4(int64_t x[4], int64_t y[4])
+{
+    return x[0] == y[0] && x[1] == y[1] && x[2] == y[2] && x[3] == y[3];
+}
+
+static int find_tuple4(int64_t tuples[24][4], int ntuples, int64_t target[4])
+{
+    for (int i = 0; i < ntuples; i++)
+    {
+        if (same_tuple4(tuples[i], target))
+            return i;
+    }
+
+    fprintf(stderr, "Error: tuple not found in orbit.\n");
+    return -1;
+}
+
+static void apply_omega_local(double *y, const double *x, int64_t tuples[24][4], int ntuples)
+{
+    for (int i = 0; i < ntuples; i++)
+    {
+        y[i] = 0.0;
+
+        for (int s = 0; s < 6; s++)
+        {
+            int64_t target[4];
+            swap4(tuples[i], swap_pairs[s][0], swap_pairs[s][1], target);
+
+            int j = find_tuple4(tuples, ntuples, target);
+            y[i] += x[j];
+        }
+    }
+}
+
+static int build_unique_orbit(int64_t a, int64_t b, int64_t c, int64_t d, int64_t tuples[24][4])
+{
+    int64_t base[4] = {a, b, c, d};
+    int ntuples = 0;
+
+    for (int p = 0; p < 24; p++)
+    {
+        int64_t cand[4] = {base[tp_t4[p][0]], base[tp_t4[p][1]], base[tp_t4[p][2]], base[tp_t4[p][3]]};
+
+        int duplicate = 0;
+        for (int q = 0; q < ntuples; q++)
+        {
+            if (same_tuple4(tuples[q], cand))
+            {
+                duplicate = 1;
+                break;
+            }
+        }
+
+        if (!duplicate)
+        {
+            tuples[ntuples][0] = cand[0];
+            tuples[ntuples][1] = cand[1];
+            tuples[ntuples][2] = cand[2];
+            tuples[ntuples][3] = cand[3];
+            ntuples++;
+        }
+    }
+    return ntuples;
+}
+
+static int s_omega_action_24[24][6];
+static int s_omega_action_24_ready = 0;
+
+static void init_omega_action_24(void)
+{
+    if (s_omega_action_24_ready)
+        return;
+
+    int rev[4][4][4][4];
+    for (int p = 0; p < 24; p++)
+        rev[tp_t4[p][0]][tp_t4[p][1]][tp_t4[p][2]][tp_t4[p][3]] = p;
+
+    for (int p = 0; p < 24; p++)
+        for (int s = 0; s < 6; s++)
+        {
+            int t[4] = {tp_t4[p][0], tp_t4[p][1], tp_t4[p][2], tp_t4[p][3]};
+            int i = swap_pairs[s][0], j = swap_pairs[s][1];
+            int tmp = t[i];
+            t[i] = t[j];
+            t[j] = tmp;
+            s_omega_action_24[p][s] = rev[t[0]][t[1]][t[2]][t[3]];
+        }
+    s_omega_action_24_ready = 1;
+}
+
+static inline void apply_omega_24(double *restrict y, const double *restrict x)
+{
+    for (int p = 0; p < 24; p++)
+        y[p] = x[s_omega_action_24[p][0]] + x[s_omega_action_24[p][1]] + x[s_omega_action_24[p][2]] + x[s_omega_action_24[p][3]] + x[s_omega_action_24[p][4]] + x[s_omega_action_24[p][5]];
+}
+
+static inline void project_orbit_(double *restrict A, int64_t h, int64_t a, int64_t b, int64_t c, int64_t d,
+                                  int64_t nvir, int64_t nvv, int64_t nvvv, double alpha, double beta)
+{
+    if (a > b && b > c && c > d)
+    {
+        int64_t idx[24];
+        idx[0] = a * nvvv + b * nvv + c * nvir + d;
+        idx[1] = a * nvvv + b * nvv + d * nvir + c;
+        idx[2] = a * nvvv + c * nvv + b * nvir + d;
+        idx[3] = a * nvvv + c * nvv + d * nvir + b;
+        idx[4] = a * nvvv + d * nvv + b * nvir + c;
+        idx[5] = a * nvvv + d * nvv + c * nvir + b;
+        idx[6] = b * nvvv + a * nvv + c * nvir + d;
+        idx[7] = b * nvvv + a * nvv + d * nvir + c;
+        idx[8] = b * nvvv + c * nvv + a * nvir + d;
+        idx[9] = b * nvvv + c * nvv + d * nvir + a;
+        idx[10] = b * nvvv + d * nvv + a * nvir + c;
+        idx[11] = b * nvvv + d * nvv + c * nvir + a;
+        idx[12] = c * nvvv + a * nvv + b * nvir + d;
+        idx[13] = c * nvvv + a * nvv + d * nvir + b;
+        idx[14] = c * nvvv + b * nvv + a * nvir + d;
+        idx[15] = c * nvvv + b * nvv + d * nvir + a;
+        idx[16] = c * nvvv + d * nvv + a * nvir + b;
+        idx[17] = c * nvvv + d * nvv + b * nvir + a;
+        idx[18] = d * nvvv + a * nvv + b * nvir + c;
+        idx[19] = d * nvvv + a * nvv + c * nvir + b;
+        idx[20] = d * nvvv + b * nvv + a * nvir + c;
+        idx[21] = d * nvvv + b * nvv + c * nvir + a;
+        idx[22] = d * nvvv + c * nvv + a * nvir + b;
+        idx[23] = d * nvvv + c * nvv + b * nvir + a;
+
+        double x[24], x1[24], x2[24], x3[24], x4[24], y[24];
+        for (int p = 0; p < 24; p++)
+            x[p] = A[h + idx[p]];
+
+        apply_omega_24(x1, x);
+        for (int p = 0; p < 24; p++)
+            x1[p] -= 6.0 * x[p];
+
+        apply_omega_24(x2, x1);
+        for (int p = 0; p < 24; p++)
+            x2[p] -= 2.0 * x1[p];
+
+        apply_omega_24(x3, x2);
+        apply_omega_24(x4, x3);
+
+        for (int p = 0; p < 24; p++)
+            y[p] = (2.0 * x4[p] + 19.0 * x3[p] + 48.0 * x2[p]) / 576.0;
+
+        for (int p = 0; p < 24; p++)
+            A[h + idx[p]] = beta * x[p] + alpha * y[p];
+    }
+    else
+    {
+        int64_t tuples[24][4];
+        int64_t indices[24];
+        int perm_map[24][6];
+        double x[24], x1[24], x2[24], x3[24], x4[24], y[24];
+        static const int swaps[6][2] = {{0, 1}, {0, 2}, {0, 3}, {1, 2}, {1, 3}, {2, 3}};
+
+        int ntuples = build_unique_orbit(a, b, c, d, tuples);
+
+        for (int p = 0; p < ntuples; p++)
+            for (int s = 0; s < 6; s++)
+            {
+                int64_t target[4];
+                swap4(tuples[p], swaps[s][0], swaps[s][1], target);
+                perm_map[p][s] = find_tuple4(tuples, ntuples, target);
+            }
+        for (int p = 0; p < ntuples; p++)
+        {
+            indices[p] = idx4(tuples[p][0], tuples[p][1], tuples[p][2], tuples[p][3], nvir, nvv, nvvv);
+            x[p] = A[h + indices[p]];
+        }
+        for (int p = 0; p < ntuples; p++)
+        {
+            double om = 0.0;
+            for (int s = 0; s < 6; s++)
+                om += x[perm_map[p][s]];
+            x1[p] = om - 6.0 * x[p];
+        }
+        for (int p = 0; p < ntuples; p++)
+        {
+            double om = 0.0;
+            for (int s = 0; s < 6; s++)
+                om += x1[perm_map[p][s]];
+            x2[p] = om - 2.0 * x1[p];
+        }
+        for (int p = 0; p < ntuples; p++)
+        {
+            double om = 0.0;
+            for (int s = 0; s < 6; s++)
+                om += x2[perm_map[p][s]];
+            x3[p] = om;
+        }
+        for (int p = 0; p < ntuples; p++)
+        {
+            double om = 0.0;
+            for (int s = 0; s < 6; s++)
+                om += x3[perm_map[p][s]];
+            x4[p] = om;
+        }
+        for (int p = 0; p < ntuples; p++)
+            y[p] = (2.0 * x4[p] + 19.0 * x3[p] + 48.0 * x2[p]) / 576.0;
+        for (int p = 0; p < ntuples; p++)
+            A[h + indices[p]] = beta * x[p] + alpha * y[p];
+    }
+}
+
+void t4_project_1_minus_p4_p31_inplace_(double *A, int64_t nocc4, int64_t nvir, double alpha, double beta)
+{
+    init_omega_action_24();
+    int64_t nvv = nvir * nvir;
+    int64_t nvvv = nvir * nvv;
+    int64_t nvvvv = nvir * nvvv;
+    const int64_t bl = 8;
+
+#pragma omp parallel for schedule(static)
+    for (int64_t ijkl = 0; ijkl < nocc4; ijkl++)
+    {
+        int64_t h = ijkl * nvvvv;
+        for (int64_t a0 = 0; a0 < nvir; a0 += bl)
+            for (int64_t b0 = 0; b0 <= a0; b0 += bl)
+                for (int64_t c0 = 0; c0 <= b0; c0 += bl)
+                    for (int64_t d0 = 0; d0 <= c0; d0 += bl)
+                        for (int64_t a = a0; a < a0 + bl && a < nvir; a++)
+                            for (int64_t b = b0; b < b0 + bl && b <= a; b++)
+                                for (int64_t c = c0; c < c0 + bl && c <= b; c++)
+                                    for (int64_t d = d0; d < d0 + bl && d <= c; d++)
+                                        project_orbit_(A, h, a, b, c, d, nvir, nvv, nvvv, alpha, beta);
+    }
+}
+
+void r4_tri_divide_e_(double *restrict r4_tri, const double *restrict eia, int64_t nocc, int64_t nvir)
+{
+    const int64_t nvir2 = nvir * nvir;
+    const int64_t nvir3 = nvir2 * nvir;
+    const int64_t nvir4 = nvir3 * nvir;
+
+    int64_t *i_start = (int64_t *)malloc((size_t)(nocc + 1) * sizeof(int64_t));
+    if (!i_start)
+    {
+        fprintf(stderr, "r4_tri_divide_e_: malloc failed\n");
+        return;
+    }
+    i_start[0] = 0;
+    for (int64_t i = 1; i <= nocc; i++)
+    {
+        int64_t m = nocc - i + 1;
+        i_start[i] = i_start[i - 1] + m * (m + 1) * (m + 2) / 6;
+    }
+
+#pragma omp parallel for schedule(dynamic)
+    for (int64_t i = 0; i < nocc; i++)
+    {
+        int64_t idx = i_start[i];
+        const double *eia_i = eia + i * nvir;
+        for (int64_t j = i; j < nocc; j++)
+        {
+            const double *eia_j = eia + j * nvir;
+            for (int64_t k = j; k < nocc; k++)
+            {
+                const double *eia_k = eia + k * nvir;
+                for (int64_t l = k; l < nocc; l++, idx++)
+                {
+                    const double *eia_l = eia + l * nvir;
+                    double *blk = r4_tri + idx * nvir4;
+                    for (int64_t a = 0; a < nvir; a++)
+                    {
+                        double eia_ia = eia_i[a];
+                        for (int64_t b = 0; b < nvir; b++)
+                        {
+                            double eijab = eia_ia + eia_j[b];
+                            for (int64_t c = 0; c < nvir; c++)
+                            {
+                                double eijkabc = eijab + eia_k[c];
+                                double *ptr = blk + a * nvir3 + b * nvir2 + c * nvir;
+                                for (int64_t d = 0; d < nvir; d++)
+                                {
+                                    if (fabs(eijkabc + eia_l[d]) > 1e-15)
+                                    {
+                                        ptr[d] /= eijkabc + eia_l[d];
+                                    }
+                                    else
+                                    {
+                                        ptr[d] = 0.0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    free(i_start);
+}
diff --git a/pyscf/lib/dft/libxc_itrf.c b/pyscf/lib/dft/libxc_itrf.c
index 02ebfbf91f..2c1d03df94 100644
--- a/pyscf/lib/dft/libxc_itrf.c
+++ b/pyscf/lib/dft/libxc_itrf.c
@@ -1036,15 +1036,19 @@ int LIBXC_max_deriv_order(const int nfunc, const xc_func_type *func)
         };
 
         for (i = 0; i < nfunc; i++) {
-                /* find the minimum order of all functionals */
+                /* find the highest order this functional supports, then keep
+                 * the minimum across all functionals. Iterate o>=0 to also
+                 * cover order-0 (EXC-only) functionals. */
                 const int flag = func[i].info->flags;
-                for (o = ord; o > 0; o--) {
+                int found = 0;
+                for (o = ord; o >= 0; o--) {
                         if (flag & DERIV_FLAGS_TABLE[o]) {
                                 ord = o;
+                                found = 1;
                                 break;
                         }
                 }
-                if (o == -1) return -1;
+                if (!found) return -1;
         }
 
         return ord;
diff --git a/pyscf/lib/dft/multigrid.c b/pyscf/lib/dft/multigrid.c
index 11344295bb..76c118f101 100644
--- a/pyscf/lib/dft/multigrid.c
+++ b/pyscf/lib/dft/multigrid.c
@@ -94,7 +94,10 @@ void init_rs_grid(RS_Grid** rs_grid, GridLevel_Info** gridlevel_info, int comp)
     int *mesh = gl_info->mesh;
     rg->data = (double**)malloc(sizeof(double*) * nlevels);
     for (i = 0; i < nlevels; i++) {
-        ngrid = mesh[i*3] * mesh[i*3+1] * mesh[i*3+2];
+        // Cast to size_t before multiplying so very fine meshes
+        // (mesh > ~1024 on a side) do not overflow int and silently
+        // under-size the FFT-grid allocation.
+        ngrid = (size_t)mesh[i*3] * mesh[i*3+1] * mesh[i*3+2];
         (rg->data)[i] = calloc(comp*ngrid, sizeof(double));
     }
     *rs_grid = rg;
@@ -171,7 +174,11 @@ double pgfpair_radius(int la, int lb, double zeta, double zetb, double* ra, doub
     double zetp = zeta + zetb;
     double eps = precision * precision;
 
-    if (rab[0] < RZERO && rab[1] < RZERO && rab[2] < RZERO) {
+    // Same-atom shortcut: compare the magnitude of the displacement vector,
+    // not the signed components. The previous "rab[0] < RZERO && ..." test
+    // wrongly fired on any all-negative displacement (~1/8 of periodic-image
+    // shifted pairs) and returned the wrong radius.
+    if (SQUARE(rab) < RZERO*RZERO) {
         radius = pgf_rcut(la+lb, zetp, 1., eps, radius);
         return radius;
     }
diff --git a/pyscf/lib/dft/test/test_sparse_dot.py b/pyscf/lib/dft/test/test_sparse_dot.py
index 8b948c5a61..feb216d8ab 100644
--- a/pyscf/lib/dft/test/test_sparse_dot.py
+++ b/pyscf/lib/dft/test/test_sparse_dot.py
@@ -162,7 +162,7 @@ def test_dot_ao_ao_case1(self):
             ctypes.c_int(nbins), s_index.ctypes.data_as(ctypes.c_void_p),
             pair_mask.ctypes.data_as(ctypes.c_void_p),
             ao_loc.ctypes.data_as(ctypes.c_void_p))
-        self.assertAlmostEqual(abs(ref - out).max(), 0, 24)
+        self.assertAlmostEqual(abs(ref - out).max(), 0, 23)
 
     def test_dot_ao_ao_case2(self):
         np.random.seed(1)
diff --git a/pyscf/lib/gto/deriv2.c b/pyscf/lib/gto/deriv2.c
index e9fa27e6bf..55a02d1c91 100644
--- a/pyscf/lib/gto/deriv2.c
+++ b/pyscf/lib/gto/deriv2.c
@@ -46,15 +46,15 @@ void GTOshell_eval_grid_cart_deriv2(double *cgto, double *ri, double *exps,
         const size_t bgrids0 = (bgrids >= SIMDD) ? (bgrids+1-SIMDD) : 0;
         int lx, ly, lz;
         size_t i, j, j1, k, l1, n;
-        double fx0[SIMDD*16];
-        double fy0[SIMDD*16];
-        double fz0[SIMDD*16];
-        double fx1[SIMDD*16];
-        double fy1[SIMDD*16];
-        double fz1[SIMDD*16];
-        double fx2[SIMDD*16];
-        double fy2[SIMDD*16];
-        double fz2[SIMDD*16];
+        double fx0[SIMDD*(LMAX+5)];
+        double fy0[SIMDD*(LMAX+5)];
+        double fz0[SIMDD*(LMAX+5)];
+        double fx1[SIMDD*(LMAX+5)];
+        double fy1[SIMDD*(LMAX+5)];
+        double fz1[SIMDD*(LMAX+5)];
+        double fx2[SIMDD*(LMAX+5)];
+        double fy2[SIMDD*(LMAX+5)];
+        double fz2[SIMDD*(LMAX+5)];
         double buf[SIMDD*10];
         double *gridx = coord;
         double *gridy = coord+BLKSIZE;
@@ -220,18 +220,18 @@ void GTOshell_eval_grid_cart_deriv3(double *cgto, double *ri, double *exps,
         const size_t bgrids0 = (bgrids >= SIMDD) ? (bgrids+1-SIMDD) : 0;
         int lx, ly, lz;
         size_t i, j, j1, k, l1, n;
-        double fx0[SIMDD*16];
-        double fy0[SIMDD*16];
-        double fz0[SIMDD*16];
-        double fx1[SIMDD*16];
-        double fy1[SIMDD*16];
-        double fz1[SIMDD*16];
-        double fx2[SIMDD*16];
-        double fy2[SIMDD*16];
-        double fz2[SIMDD*16];
-        double fx3[SIMDD*16];
-        double fy3[SIMDD*16];
-        double fz3[SIMDD*16];
+        double fx0[SIMDD*(LMAX+5)];
+        double fy0[SIMDD*(LMAX+5)];
+        double fz0[SIMDD*(LMAX+5)];
+        double fx1[SIMDD*(LMAX+5)];
+        double fy1[SIMDD*(LMAX+5)];
+        double fz1[SIMDD*(LMAX+5)];
+        double fx2[SIMDD*(LMAX+5)];
+        double fy2[SIMDD*(LMAX+5)];
+        double fz2[SIMDD*(LMAX+5)];
+        double fx3[SIMDD*(LMAX+5)];
+        double fy3[SIMDD*(LMAX+5)];
+        double fz3[SIMDD*(LMAX+5)];
         double buf[SIMDD*20];
         double *gridx = coord;
         double *gridy = coord+BLKSIZE;
@@ -447,21 +447,21 @@ void GTOshell_eval_grid_cart_deriv4(double *cgto, double *ri, double *exps,
         const size_t bgrids0 = (bgrids >= SIMDD) ? (bgrids+1-SIMDD) : 0;
         int lx, ly, lz;
         size_t i, j, j1, k, l1, n;
-        double fx0[SIMDD*16];
-        double fy0[SIMDD*16];
-        double fz0[SIMDD*16];
-        double fx1[SIMDD*16];
-        double fy1[SIMDD*16];
-        double fz1[SIMDD*16];
-        double fx2[SIMDD*16];
-        double fy2[SIMDD*16];
-        double fz2[SIMDD*16];
-        double fx3[SIMDD*16];
-        double fy3[SIMDD*16];
-        double fz3[SIMDD*16];
-        double fx4[SIMDD*16];
-        double fy4[SIMDD*16];
-        double fz4[SIMDD*16];
+        double fx0[SIMDD*(LMAX+5)];
+        double fy0[SIMDD*(LMAX+5)];
+        double fz0[SIMDD*(LMAX+5)];
+        double fx1[SIMDD*(LMAX+5)];
+        double fy1[SIMDD*(LMAX+5)];
+        double fz1[SIMDD*(LMAX+5)];
+        double fx2[SIMDD*(LMAX+5)];
+        double fy2[SIMDD*(LMAX+5)];
+        double fz2[SIMDD*(LMAX+5)];
+        double fx3[SIMDD*(LMAX+5)];
+        double fy3[SIMDD*(LMAX+5)];
+        double fz3[SIMDD*(LMAX+5)];
+        double fx4[SIMDD*(LMAX+5)];
+        double fy4[SIMDD*(LMAX+5)];
+        double fz4[SIMDD*(LMAX+5)];
         double buf[SIMDD*35];
         double *gridx = coord;
         double *gridy = coord+BLKSIZE;
diff --git a/pyscf/lib/gto/fill_grids_int2c.c b/pyscf/lib/gto/fill_grids_int2c.c
index b0e9c83b2f..76b21b9b21 100644
--- a/pyscf/lib/gto/fill_grids_int2c.c
+++ b/pyscf/lib/gto/fill_grids_int2c.c
@@ -18,6 +18,7 @@
 
 #include <stdlib.h>
 #include <complex.h>
+#include <assert.h>
 #include "config.h"
 #include "cint.h"
 #include "np_helper/np_helper.h"
@@ -75,6 +76,11 @@ void GTOgrids_int2c(int (*intor)(), double *mat, int comp, int hermi,
         const int njsh = jsh1 - jsh0;
         const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
         const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+        // The hermi-mode symmetrization loop below decodes (i, j) via
+        // ig/naoj and then writes to both mat[j*naoi+i] and mat[i*naoi+j],
+        // which only addresses the right cells when naoi == naoj and the
+        // slice is square.
+        assert(hermi == PLAIN || (ish0 == jsh0 && naoi == naoj));
         const size_t cache_size = _max_cache_size(intor, shls_slice, 2,
                                                   atm, natm, bas, nbas, env);
         const int dims[] = {naoi, naoj, ngrids};
@@ -92,17 +98,17 @@ void GTOgrids_int2c(int (*intor)(), double *mat, int comp, int hermi,
                         // fill up only upper triangle of F-array
                         continue;
                 }
+                ish += ish0;
+                jsh += jsh0;
+                shls[0] = ish;
+                shls[1] = jsh;
+                i0 = ao_loc[ish] - ao_loc[ish0];
+                j0 = ao_loc[jsh] - ao_loc[jsh0];
 
                 for (grid0 = 0; grid0 < ngrids; grid0 += BLKSIZE) {
                         grid1 = MIN(grid0 + BLKSIZE, ngrids);
-                        ish += ish0;
-                        jsh += jsh0;
-                        shls[0] = ish;
-                        shls[1] = jsh;
                         shls[2] = grid0;
                         shls[3] = grid1;
-                        i0 = ao_loc[ish] - ao_loc[ish0];
-                        j0 = ao_loc[jsh] - ao_loc[jsh0];
                         (*intor)(mat+ngrids*(j0*naoi+i0)+grid0, dims, shls,
                                  atm, natm, bas, nbas, env, opt, cache);
                 }
@@ -153,6 +159,9 @@ void GTOgrids_int2c_spinor(int (*intor)(), double complex *mat, int comp, int he
         const int njsh = jsh1 - jsh0;
         const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
         const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+        // Hermi-mode symmetrization assumes a square layout (same reasoning
+        // as the real variant above).
+        assert(hermi == PLAIN || (ish0 == jsh0 && naoi == naoj));
         const size_t cache_size = _max_cache_size(intor, shls_slice, 2,
                                                   atm, natm, bas, nbas, env);
         int dims[] = {naoi, naoj, ngrids};
@@ -169,17 +178,17 @@ void GTOgrids_int2c_spinor(int (*intor)(), double complex *mat, int comp, int he
                 if (hermi != PLAIN && ish > jsh) {
                         continue;
                 }
+                ish += ish0;
+                jsh += jsh0;
+                shls[0] = ish;
+                shls[1] = jsh;
+                i0 = ao_loc[ish] - ao_loc[ish0];
+                j0 = ao_loc[jsh] - ao_loc[jsh0];
 
                 for (grid0 = 0; grid0 < ngrids; grid0 += BLKSIZE) {
                         grid1 = MIN(grid0 + BLKSIZE, ngrids);
-                        ish += ish0;
-                        jsh += jsh0;
-                        shls[0] = ish;
-                        shls[1] = jsh;
                         shls[2] = grid0;
                         shls[3] = grid1;
-                        i0 = ao_loc[ish] - ao_loc[ish0];
-                        j0 = ao_loc[jsh] - ao_loc[jsh0];
                         (*intor)(mat+ngrids*(j0*naoi+i0)+grid0, dims, shls,
                                  atm, natm, bas, nbas, env, opt, cache);
                 }
diff --git a/pyscf/lib/gto/fill_int2c.c b/pyscf/lib/gto/fill_int2c.c
index 137fe8c15a..e0a5706d2b 100644
--- a/pyscf/lib/gto/fill_int2c.c
+++ b/pyscf/lib/gto/fill_int2c.c
@@ -19,6 +19,7 @@
 #include <stdlib.h>
 #include <complex.h>
 #include <math.h>
+#include <assert.h>
 #include "config.h"
 #include "cint.h"
 #include "np_helper/np_helper.h"
@@ -44,6 +45,12 @@ void GTOint2c(int (*intor)(), double *mat, int comp, int hermi,
         const int njsh = jsh1 - jsh0;
         const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
         const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+        // Hermi-mode symmetrization (NPdsymm_triu below) and the upper-
+        // triangle skip (ish > jsh in slice-relative indices) both assume a
+        // square layout with ish0 == jsh0. The per-component stride
+        // ic*naoi*naoi also assumes that. Calling with hermi != PLAIN on a
+        // rectangular or offset slice silently corrupts the matrix.
+        assert(hermi == PLAIN || (ish0 == jsh0 && naoi == naoj));
         const int cache_size = GTOmax_cache_size(intor, shls_slice, 2,
                                                  atm, natm, bas, nbas, env);
 #pragma omp parallel
@@ -92,6 +99,9 @@ void GTOint2c_spinor(int (*intor)(), double complex *mat, int comp, int hermi,
         const int njsh = jsh1 - jsh0;
         const size_t naoi = ao_loc[ish1] - ao_loc[ish0];
         const size_t naoj = ao_loc[jsh1] - ao_loc[jsh0];
+        // Hermi-mode symmetrization assumes a square layout (same reasoning
+        // as the real variant above).
+        assert(hermi == PLAIN || (ish0 == jsh0 && naoi == naoj));
         const int cache_size = GTOmax_cache_size(intor, shls_slice, 2,
                                                  atm, natm, bas, nbas, env);
 
diff --git a/pyscf/lib/gto/ft_ao.c b/pyscf/lib/gto/ft_ao.c
index 04ba219360..76b536b86c 100644
--- a/pyscf/lib/gto/ft_ao.c
+++ b/pyscf/lib/gto/ft_ao.c
@@ -549,7 +549,7 @@ void GTO_Gv_orth(double *gzR, double *gzI, double fac, double aij,
         int *idy = idx + nGv;
         int *idz = idy + nGv;
 
-        double cutoff = EXPCUTOFF * aij * 4;
+        double cutoff = envs->expcutoff * aij * 4;
         double aij4 = .25 / aij;
         double complex fac1 = fac * envs->common_factor;
         int n, ix, iy, iz;
@@ -636,7 +636,7 @@ void GTO_Gv_nonorth(double *gzR, double *gzI, double fac, double aij,
         int *idy = idx + nGv;
         int *idz = idy + nGv;
 
-        double cutoff = EXPCUTOFF * aij * 4;
+        double cutoff = envs->expcutoff * aij * 4;
         double aij4 = -.25 / aij;
         double complex fac1 = fac * envs->common_factor;
         int ix, iy, iz;
@@ -1094,7 +1094,7 @@ if (ioff == joff) {
                 for (i = 0; i < di; i++) {
                         pbufR = bufR + ic * dijg + dg * (j*di+i);
                         pbufI = bufI + ic * dijg + dg * (j*di+i);
-                        ij = j * nj + i;
+                        ij = j * ni + i;
                         ji = i * nj + j;
                         for (n = 0; n < dg; n++) {
                                 pout_ij[(ij*NGv+n)*OF_CMPLX  ] += pbufR[n];
diff --git a/pyscf/lib/gto/grid_ao_drv.c b/pyscf/lib/gto/grid_ao_drv.c
index fde945f325..ca9a8fa31c 100644
--- a/pyscf/lib/gto/grid_ao_drv.c
+++ b/pyscf/lib/gto/grid_ao_drv.c
@@ -17,6 +17,7 @@
  */
 
 #include <stdlib.h>
+#include <assert.h>
 #include <math.h>
 #include <stdint.h>
 #include <complex.h>
@@ -32,8 +33,10 @@ void GTO_screen_index(uint8_t *screen_index, int nbins, double cutoff,
                       double *coords, int ngrids, int blksize,
                       int *atm, int natm, int *bas, int nbas, double *env)
 {
+        // Keep nbins < 120 so si = nbins - arr*scale + 1 fits in uint8_t
+        // without saturating the screen_index = 255 cap below.
+        assert(nbins < 120);
         double scale = -nbins / log(MIN(cutoff, .1));
-        nbins = MIN(127, nbins);
 #pragma omp parallel
 {
         const int nblk = (ngrids+blksize-1) / blksize;
@@ -101,8 +104,15 @@ void GTO_screen_index(uint8_t *screen_index, int nbins, double cutoff,
                                         - log_coeff;
                         }
                         si = nbins - arr * scale;
+                        /* screen_index is uint8: 0 = screened out, otherwise stored value
+                         * is (raw_si + 1). Cap at 254 to keep the +1 from wrapping mod
+                         * 256, which would silently demote a very-significant entry
+                         * (large -arr from extremely tight AOs) to 0 = "screened out".
+                         * Behavior for si <= 254 is unchanged. */
                         if (si <= 0) {
                                 screen_index[ib*nbas+bas_id] = 0;
+                        } else if (si > 254) {
+                                screen_index[ib*nbas+bas_id] = 255;
                         } else {
                                 screen_index[ib*nbas+bas_id] = (uint8_t)(si + 1);
                         }
diff --git a/pyscf/lib/gto/nr_ecp.c b/pyscf/lib/gto/nr_ecp.c
index 652033323a..f59ca31caa 100644
--- a/pyscf/lib/gto/nr_ecp.c
+++ b/pyscf/lib/gto/nr_ecp.c
@@ -18,6 +18,7 @@
 
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <math.h>
 #include <complex.h>
 #include "cint.h"
@@ -4686,7 +4687,11 @@ static const double _j_inv[] = { // 1/j
 };
 void ECPsph_ine_opt(double *out, int order, double z)
 {
-        if (z < 1e-7 || z > 16) {
+        // The default branch below reads/writes k0[0..order+K_TAYLOR_MAX] which
+        // would overrun the K_TAB_COL-wide _sph_ine_tab row (and the K_TAB_COL
+        // local buf) when order > K_TAB_COL-K_TAYLOR_MAX-1 = 16. Fall back to
+        // the slow general routine for high-l basis combined with high-l ECP.
+        if (z < 1e-7 || z > 16 || order > K_TAB_COL - K_TAYLOR_MAX - 1) {
                 return ECPsph_ine(out, order, z);
         } else {
                 /*
@@ -5457,7 +5462,7 @@ int ECPtype2_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
                         pradi = radi + ic * nrs * lilc1;
                         pradj = radj + jc * nrs * ljlc1;
                         for (lab = 0; lab <= li+lj; lab++, ijl++) {
-                                if (!converged[ijl]) {
+                                if (converged[ijl] < 2) {
         prur = rur + lab * nrs;
         prad = rad_all + ijl*d2;
         for (i = 0; i < d2; i++) {
@@ -5474,11 +5479,20 @@ int ECPtype2_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
                 prad[i*ljlc1+j] = s;
         } }
 
-        for (i = 0; i < d2; i++) {
-                if (!CLOSE_ENOUGH(plast[i],prad[i])) {
+        {
+                int _pair_close = 1;
+                for (i = 0; i < d2; i++) {
+                        if (!CLOSE_ENOUGH(plast[i],prad[i])) {
+                                _pair_close = 0;
+                                break;
+                        }
+                }
+                if (_pair_close) {
+                        converged[ijl] += 1;
+                        if (converged[ijl] < 2) { all_conv = 0; }
+                } else {
                         converged[ijl] = 0;
                         all_conv = 0;
-                        break;
                 }
         }
                                 }
@@ -5571,7 +5585,6 @@ int ECPtype_so_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
         MARK_STACK;
         MALLOC_INSTACK(angi, (li+1)*nfi*(ECP_LMAX*2+1)*(li+ECP_LMAX+1));
         MALLOC_INSTACK(angj, (lj+1)*nfj*(ECP_LMAX*2+1)*(lj+ECP_LMAX+1));
-        MALLOC_INSTACK(buf, nfi*(ECP_LMAX*2+1)*(lj+ECP_LMAX+1));
         MALLOC_INSTACK(jmm_angj, (lj+1)*nfj*(ECP_LMAX*2+1)*(lj+ECP_LMAX+1)*3);
         MALLOC_INSTACK(buf, nfi*(ECP_LMAX*2+1)*(lj+ECP_LMAX+1));
 
@@ -5613,6 +5626,19 @@ int ECPtype_so_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
                         n = ecpbas[ATOM_OF+ecploc[iloc]*BAS_SLOTS];
                         lc = ecp_lmax[n] + 1;
                 }
+                // _angular_moment_matrix[] only has entries for lc in 0..4
+                // (s..g). Higher-l SO-ECP projectors (or Ul fallbacks that push
+                // lc past 4) would read past the table in transform_angj and
+                // overflow the angi/angj/jmm_angj allocations sized assuming
+                // lc <= ECP_LMAX. Skip rather than crash silently.
+                if (lc > 4) {
+                        fprintf(stderr,
+                                "ECPtype_so_cart: SO-ECP projector with lc=%d "
+                                "(atom %d) is not supported (max lc=4); "
+                                "skipping.\n",
+                                lc, ecpbas[ATOM_OF+ecploc[iloc]*BAS_SLOTS]);
+                        continue;
+                }
                 atm_id = ecpbas[ATOM_OF+ecploc[iloc]*BAS_SLOTS];
                 rc = env + atm[PTR_COORD+atm_id*ATM_SLOTS];
                 ecpshls = ecploc + iloc;
@@ -5672,7 +5698,7 @@ int ECPtype_so_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
                         pradi = radi + ic * nrs * lilc1;
                         pradj = radj + jc * nrs * ljlc1;
                         for (lab = 0; lab <= li+lj; lab++, ijl++) {
-                                if (!converged[ijl]) {
+                                if (converged[ijl] < 2) {
         prur = rur + lab * nrs;
         prad = rad_all + ijl*d2;
         for (i = 0; i < d2; i++) {
@@ -5689,11 +5715,20 @@ int ECPtype_so_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
                 prad[i*ljlc1+j] = s;
         } }
 
-        for (i = 0; i < d2; i++) {
-                if (!CLOSE_ENOUGH(plast[i], prad[i])) {
+        {
+                int _pair_close = 1;
+                for (i = 0; i < d2; i++) {
+                        if (!CLOSE_ENOUGH(plast[i], prad[i])) {
+                                _pair_close = 0;
+                                break;
+                        }
+                }
+                if (_pair_close) {
+                        converged[ijl] += 1;
+                        if (converged[ijl] < 2) { all_conv = 0; }
+                } else {
                         converged[ijl] = 0;
                         all_conv = 0;
-                        break;
                 }
         }
                                 }
@@ -5919,7 +5954,7 @@ int ECPtype1_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
                 all_conv = 1;
                 for (ip = 0; ip < npi; ip++) {
                 for (jp = 0; jp < npj; jp++) {
-                        if (!converged[ip*npj+jp]) {
+                        if (converged[ip*npj+jp] < 2) {
                                 prad = rad_all + (ip*npj+jp)*d2;
                                 for (i = 0; i < d2; i++) {
                                         plast[i] = prad[i];
@@ -5930,12 +5965,20 @@ int ECPtype1_cart(double *gctr, int *shls, int *ecpbas, int necpbas,
                                 rij[2] = ai[ip] * rca[2] + aj[jp] * rcb[2];
                                 type1_rad_part(prad, li+lj, sqrt(SQUARE(rij))*2,
                                                ai[ip]+aj[jp], ur, rs+start, nrs, step, cache);
-                                converged[ip*npj+jp] = 1;
-                                for (i = 0; i < d2; i++) {
-                                        if (!CLOSE_ENOUGH(plast[i],prad[i])) {
+                                {
+                                        int _pair_close = 1;
+                                        for (i = 0; i < d2; i++) {
+                                                if (!CLOSE_ENOUGH(plast[i],prad[i])) {
+                                                        _pair_close = 0;
+                                                        break;
+                                                }
+                                        }
+                                        if (_pair_close) {
+                                                converged[ip*npj+jp] += 1;
+                                                if (converged[ip*npj+jp] < 2) { all_conv = 0; }
+                                        } else {
                                                 converged[ip*npj+jp] = 0;
                                                 all_conv = 0;
-                                                break;
                                         }
                                 }
                         }
@@ -6389,7 +6432,7 @@ void ECPdel_optimizer(ECPOpt **opt)
                 free(opt0->u_ecp);
         }
         free(opt0);
-        opt = NULL;
+        *opt = NULL;
 }
 
 
diff --git a/pyscf/lib/gto/nr_ecp.h b/pyscf/lib/gto/nr_ecp.h
index 98bd122dd7..68da22e4a0 100644
--- a/pyscf/lib/gto/nr_ecp.h
+++ b/pyscf/lib/gto/nr_ecp.h
@@ -13,7 +13,7 @@
 #define SIM_ZERO        1e-50
 #define EXPCUTOFF       39   // 1e-17
 #define CUTOFF          460  // ~ 1e200
-#define CLOSE_ENOUGH(x, y)      (fabs(x-y) < 1e-12*fabs(y) || fabs(x-y) < 1e-12)
+#define CLOSE_ENOUGH(x, y)      (fabs(x-y) <= 1e-12 * fmax(fabs(x), fabs(y)))
 #define SQUARE(r)       (r[0]*r[0]+r[1]*r[1]+r[2]*r[2])
 #define CART_CUM        (455+1) // upto l = 12
 #define K_TAYLOR_MAX    7
diff --git a/pyscf/lib/mcscf/fci_rdm.c b/pyscf/lib/mcscf/fci_rdm.c
index ef7ea2aa5d..9b81156b0b 100644
--- a/pyscf/lib/mcscf/fci_rdm.c
+++ b/pyscf/lib/mcscf/fci_rdm.c
@@ -149,7 +149,8 @@ static void tril_particle_symm(double *rdm2, double *tbra, double *tket,
         const char TRANS_T = 'T';
         int nnorb = norb * norb;
         int i, j, k, m, n;
-        int blk = MIN(((int)(48/norb))*norb, nnorb);
+        int blk_units = 48 / norb;
+        int blk = MIN(MAX(blk_units, 1) * norb, nnorb);
         double *buf = malloc(sizeof(double) * nnorb*bcount);
         double *p1;
 
@@ -309,7 +310,7 @@ void FCIrdm12kern_sf(double *rdm1, double *rdm2, double *bra, double *ket,
                            clink_indexa, clink_indexb);
         if (csum > CSUMTHR) {
                 dgemv_(&TRANS_N, &nnorb, &bcount, &D1, buf, &nnorb,
-                       ket+stra_id*nb+strb_id, &INC1, &D1, rdm1, &INC1);
+                       ket+stra_id*(size_t)nb+strb_id, &INC1, &D1, rdm1, &INC1);
                 switch (symm) {
                 case BRAKETSYM:
                         dsyrk_(&UP, &TRANS_N, &nnorb, &bcount,
@@ -366,7 +367,7 @@ void FCIrdm12kern_spin0(double *rdm1, double *rdm2, double *bra, double *ket,
         }
         if (csum > CSUMTHR) {
                 dgemv_(&TRANS_N, &nnorb, &fill1, &D2, buf, &nnorb,
-                       ket+stra_id*na+strb_id, &INC1, &D1, rdm1, &INC1);
+                       ket+stra_id*(size_t)na+strb_id, &INC1, &D1, rdm1, &INC1);
 
                 for (i = fill0*nnorb; i < fill1*nnorb; i++) {
                         buf[i] *= SQRT2;
@@ -417,7 +418,7 @@ void FCItdm12kern_sf(double *tdm1, double *tdm2, double *bra, double *ket,
                            clink_indexa, clink_indexb);
         if (csum < CSUMTHR) { goto _normal_end; }
         dgemv_(&TRANS_N, &nnorb, &bcount, &D1, buf0, &nnorb,
-               bra+stra_id*nb+strb_id, &INC1, &D1, tdm1, &INC1);
+               bra+stra_id*(size_t)nb+strb_id, &INC1, &D1, tdm1, &INC1);
         switch (symm) {
         case PARTICLESYM:
                 tril_particle_symm(tdm2, buf1, buf0, bcount, norb, D1, D1);
@@ -456,7 +457,7 @@ void FCIrdm12kern_a(double *rdm1, double *rdm2, double *bra, double *ket,
                               norb, nb, nlinka, clink_indexa);
         if (csum > CSUMTHR) {
                 dgemv_(&TRANS_N, &nnorb, &bcount, &D1, buf, &nnorb,
-                       ket+stra_id*nb+strb_id, &INC1, &D1, rdm1, &INC1);
+                       ket+stra_id*(size_t)nb+strb_id, &INC1, &D1, rdm1, &INC1);
                 switch (symm) {
                 case BRAKETSYM:
                         dsyrk_(&UP, &TRANS_N, &nnorb, &bcount,
@@ -494,7 +495,7 @@ void FCIrdm12kern_b(double *rdm1, double *rdm2, double *bra, double *ket,
                               norb, nb, nlinkb, clink_indexb);
         if (csum > CSUMTHR) {
                 dgemv_(&TRANS_N, &nnorb, &bcount, &D1, buf, &nnorb,
-                       ket+stra_id*nb+strb_id, &INC1, &D1, rdm1, &INC1);
+                       ket+stra_id*(size_t)nb+strb_id, &INC1, &D1, rdm1, &INC1);
                 switch (symm) {
                 case BRAKETSYM:
                         dsyrk_(&UP, &TRANS_N, &nnorb, &bcount,
@@ -533,7 +534,7 @@ void FCItdm12kern_a(double *tdm1, double *tdm2, double *bra, double *ket,
                               norb, nb, nlinka, clink_indexa);
         if (csum < CSUMTHR) { goto _normal_end; }
         dgemv_(&TRANS_N, &nnorb, &bcount, &D1, buf0, &nnorb,
-               bra+stra_id*nb+strb_id, &INC1, &D1, tdm1, &INC1);
+               bra+stra_id*(size_t)nb+strb_id, &INC1, &D1, tdm1, &INC1);
         switch (symm) {
         case PARTICLESYM:
                 tril_particle_symm(tdm2, buf1, buf0, bcount, norb, D1, D1);
@@ -568,7 +569,7 @@ void FCItdm12kern_b(double *tdm1, double *tdm2, double *bra, double *ket,
                               norb, nb, nlinkb, clink_indexb);
         if (csum < CSUMTHR) { goto _normal_end; }
         dgemv_(&TRANS_N, &nnorb, &bcount, &D1, buf0, &nnorb,
-               bra+stra_id*nb+strb_id, &INC1, &D1, tdm1, &INC1);
+               bra+stra_id*(size_t)nb+strb_id, &INC1, &D1, tdm1, &INC1);
         switch (symm) {
         case PARTICLESYM:
                 tril_particle_symm(tdm2, buf1, buf0, bcount, norb, D1, D1);
diff --git a/pyscf/lib/mcscf/fci_string.c b/pyscf/lib/mcscf/fci_string.c
index 5ef53b5161..e04e90314b 100644
--- a/pyscf/lib/mcscf/fci_string.c
+++ b/pyscf/lib/mcscf/fci_string.c
@@ -187,7 +187,9 @@ void FCIaddrs2str(uint64_t *strings, int *addrs, int count, int norb, int nelec)
         for (i = 0; i < count; i++) {
                 addr = addrs[i];
                 if (addr == 0 || nelec == norb || nelec == 0) {
-                        strings[i] = (1UL << nelec) - 1UL;
+                        // 1ULL not 1UL: on LLP64 (Windows) "unsigned long" is
+                        // 32 bits, so 1UL << nelec wraps for nelec >= 32.
+                        strings[i] = (1ULL << nelec) - 1ULL;
                         continue;
                 }
 
@@ -199,10 +201,10 @@ void FCIaddrs2str(uint64_t *strings, int *addrs, int count, int norb, int nelec)
                         if (nelec_left == 0) {
                                 break;
                         } else if (addr == 0) {
-                                str1 |= (1UL << nelec_left) - 1UL;
+                                str1 |= (1ULL << nelec_left) - 1ULL;
                                 break;
                         } else if (nextaddr <= addr) {
-                                str1 |= 1UL << norb_left;
+                                str1 |= 1ULL << norb_left;
                                 addr -= nextaddr;
                                 nextaddr *= nelec_left;
                                 nextaddr /= norb_left;
diff --git a/pyscf/lib/mcscf/select_ci.c b/pyscf/lib/mcscf/select_ci.c
index 9486c68821..4652de6e80 100644
--- a/pyscf/lib/mcscf/select_ci.c
+++ b/pyscf/lib/mcscf/select_ci.c
@@ -355,6 +355,11 @@ void SCIcontract_2e_bbaa(double *eri, double *ci0, double *ci1,
         FCIcompress_link_tril(clinka, link_indexa, na, nlinka);
         FCIcompress_link_tril(clinkb, link_indexb, nb, nlinkb);
 
+        // NOTE: ci1 is intentionally NOT zeroed here. The Python wrappers
+        // (selected_ci.py / selected_ci_spin0.py) call this after the
+        // (aa|aa) and (bb|bb) SCIcontract_2e_aaaa kernels and rely on the
+        // (bb|aa) contribution being accumulated on top.
+
 #pragma omp parallel
 {
         int strk, ib, blen;
@@ -573,6 +578,11 @@ void SCIcontract_2e_bbaa_symm(double *eri, double *ci0, double *ci1,
         FCIcompress_link_tril(clinka, link_indexa, na, nlinka);
         FCIcompress_link_tril(clinkb, link_indexb, nb, nlinkb);
 
+        // NOTE: ci1 is intentionally NOT zeroed here. The Python wrappers
+        // (selected_ci_symm.py / selected_ci_spin0_symm.py) call this after
+        // the (aa|aa) and (bb|bb) SCIcontract_2e_aaaa_symm kernels and
+        // rely on accumulation.
+
 #pragma omp parallel
 {
         int strk, ib, blen;
diff --git a/pyscf/lib/misc.py b/pyscf/lib/misc.py
index 746e8583e8..6b8236ffe6 100644
--- a/pyscf/lib/misc.py
+++ b/pyscf/lib/misc.py
@@ -22,6 +22,7 @@
 
 import os
 import sys
+import atexit
 import time
 import random
 import platform
@@ -91,36 +92,89 @@ def _ldd(so_file):
 c_int_p = ctypes.POINTER(ctypes.c_int)
 c_null_ptr = ctypes.POINTER(ctypes.c_void_p)
 
+_dll_deps = {
+    'libcgto':       ['libcint'],
+    'libcvhf':       ['libcint'],
+    'libao2mo':      ['libcint', 'libcvhf'],
+    'libdft':        ['libcvhf', 'libcgto', 'libcint'],
+    'libpbc':        ['libcint', 'libcgto'],
+    'libri':         ['libao2mo', 'libcvhf', 'libcgto', 'libcint'],
+    'libxc_itrf':    ['xc'],
+    'libxcfun_itrf': ['xcfun'],
+}
+
 @functools.lru_cache(128)
 def load_library(libname):
+    lib = None
     try:
         _loaderpath = os.path.dirname(__file__)
-        return numpy.ctypeslib.load_library(libname, _loaderpath)
+        lib = numpy.ctypeslib.load_library(libname, _loaderpath)
     except OSError:
+        pass
+
+    if lib is None and sys.platform == 'win32':
+        for env_path in [os.path.join(sys.prefix, 'Library', 'bin'),
+                         os.path.join(sys.prefix, 'Library', 'lib')]:
+            try:
+                lib = numpy.ctypeslib.load_library(libname, env_path)
+                break
+            except OSError:
+                pass
+
+    if lib is None:
         from pyscf import __path__ as ext_modules
         for path in ext_modules:
             libpath = os.path.join(path, 'lib')
             if os.path.isdir(libpath):
                 for files in os.listdir(libpath):
                     if files.startswith(libname):
-                        return numpy.ctypeslib.load_library(libname, libpath)
-        raise
+                        lib = numpy.ctypeslib.load_library(libname, libpath)
+                        break
+                if lib is not None:
+                    break
+        if lib is None:
+            raise OSError(f'Library {libname} not found')
+
+    if sys.platform == 'win32' and libname in _dll_deps:
+        deps = [load_library(d) for d in _dll_deps[libname]]
+        lib = make_dll_wrapper(lib, *deps)
+    return lib
+
+
+def make_dll_wrapper(lib, *fallbacks):
+    if sys.platform != 'win32':
+        return lib
+    class _DllWrapper:
+        def __init__(self, primary, *fallbacks):
+            object.__setattr__(self, '_primary', primary)
+            object.__setattr__(self, '_fallbacks', fallbacks)
+        def __getattr__(self, name):
+            for dll in (self._primary,) + self._fallbacks:
+                try:
+                    return getattr(dll, name)
+                except AttributeError:
+                    pass
+            raise AttributeError(f"function '{name}' not found")
+    return _DllWrapper(lib, *fallbacks)
 
 #Fixme, the standard resource module gives wrong number when objects are released
 # http://fa.bianp.net/blog/2013/different-ways-to-get-memory-consumption-or-lessons-learned-from-memory_profiler/#fn:1
 #or use slow functions as memory_profiler._get_memory did
-CLOCK_TICKS = os.sysconf("SC_CLK_TCK")
-PAGESIZE = os.sysconf("SC_PAGE_SIZE")
 def current_memory():
     '''Return the size of used memory and allocated virtual memory (in MB)'''
-    #import resource
-    #return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000
     if sys.platform.startswith('linux'):
+        pagesize = os.sysconf("SC_PAGE_SIZE")
         with open("/proc/%s/statm" % os.getpid()) as f:
-            vms, rss = [int(x)*PAGESIZE for x in f.readline().split()[:2]]
+            vms, rss = [int(x) * pagesize for x in f.readline().split()[:2]]
             return rss/1e6, vms/1e6
     else:
-        return 0, 0
+        try:
+            import psutil
+            process = psutil.Process(os.getpid())
+            mem_info = process.memory_info()
+            return mem_info.rss/1e6, mem_info.vms/1e6
+        except (ImportError, Exception):
+            return 0, 0
 
 def num_threads(n=None):
     '''Set the number of OMP threads.  If argument is not specified, the
@@ -492,7 +546,7 @@ def __enter__(self):
         self._contents = None
         self.old_stdout_fileno = sys.stdout.fileno()
         self.bak_stdout_fd = os.dup(self.old_stdout_fileno)
-        self.ftmp = tempfile.NamedTemporaryFile(dir=param.TMPDIR)
+        self.ftmp = NamedTemporaryFile(dir=param.TMPDIR)
         os.dup2(self.ftmp.file.fileno(), self.old_stdout_fileno)
         return self
     def __exit__(self, type, value, traceback):
@@ -1257,6 +1311,35 @@ def __exit__(self, type, value, traceback):
         self.close()
 
 
+def NamedTemporaryFile(*args, **kwargs):
+    '''Create a named temporary file object. This function wraps
+    `tempfile.NamedTemporaryFile`. On Windows, `delete=False` is forced
+    to prevent permission errors when the file is reopened by another
+    handle.
+
+    Examples:
+
+    >>> from pyscf import lib
+    >>> ftmp = lib.NamedTemporaryFile()
+    >>> ftmp.name
+    '''
+    if sys.platform == 'win32':
+        kwargs['delete'] = False
+    f = tempfile.NamedTemporaryFile(*args, **kwargs)
+    if sys.platform == 'win32':
+        def _close_and_unlink():
+            try:
+                f.close()
+            except Exception:
+                pass
+            try:
+                os.unlink(f.name)
+            except OSError:
+                pass
+        atexit.register(_close_and_unlink)
+    return f
+
+
 def fingerprint(a):
     '''Fingerprint of numpy array'''
     a = numpy.asarray(a)
diff --git a/pyscf/lib/np_helper/condense.c b/pyscf/lib/np_helper/condense.c
index 586ca80e07..56051bbd91 100644
--- a/pyscf/lib/np_helper/condense.c
+++ b/pyscf/lib/np_helper/condense.c
@@ -251,6 +251,9 @@ int8_t NP_all(int8_t *a, int nd, int di, int dj)
 
 uint8_t NP_Bmax(uint8_t *a, int nd, int di, int dj)
 {
+        if (di == 0 || dj == 0) {
+                return 0;
+        }
         int i, j;
         uint8_t out = a[0];
         for (i = 0; i < di; i++) {
@@ -262,6 +265,9 @@ uint8_t NP_Bmax(uint8_t *a, int nd, int di, int dj)
 
 int NP_imax(int *a, int nd, int di, int dj)
 {
+        if (di == 0 || dj == 0) {
+                return 0;
+        }
         int i, j;
         int out = a[0];
         for (i = 0; i < di; i++) {
@@ -273,6 +279,9 @@ int NP_imax(int *a, int nd, int di, int dj)
 
 float NP_fmax(float *a, int nd, int di, int dj)
 {
+        if (di == 0 || dj == 0) {
+                return 0.f;
+        }
         int i, j;
         float out = a[0];
         for (i = 0; i < di; i++) {
diff --git a/pyscf/lib/np_helper/pack_tril.c b/pyscf/lib/np_helper/pack_tril.c
index 0859c44305..f855f09c03 100644
--- a/pyscf/lib/np_helper/pack_tril.c
+++ b/pyscf/lib/np_helper/pack_tril.c
@@ -217,8 +217,8 @@ void NPdunpack_tril_2d(int count, int n, double *tril, double *mat, int hermi)
         shared(count, n, tril, mat, hermi)
 {
         int ic;
-        size_t nn = n * n;
-        size_t n2 = n*(n+1)/2;
+        size_t nn = (size_t)n * n;
+        size_t n2 = (size_t)n*(n+1)/2;
 #pragma omp for schedule (static)
         for (ic = 0; ic < count; ic++) {
                 NPdunpack_tril(n, tril+n2*ic, mat+nn*ic, hermi);
@@ -233,8 +233,8 @@ void NPzunpack_tril_2d(int count, int n,
         shared(count, n, tril, mat, hermi)
 {
         int ic;
-        size_t nn = n * n;
-        size_t n2 = n*(n+1)/2;
+        size_t nn = (size_t)n * n;
+        size_t n2 = (size_t)n*(n+1)/2;
 #pragma omp for schedule (static)
         for (ic = 0; ic < count; ic++) {
                 NPzunpack_tril(n, tril+n2*ic, mat+nn*ic, hermi);
@@ -248,8 +248,8 @@ void NPdpack_tril_2d(int count, int n, double *tril, double *mat)
         shared(count, n, tril, mat)
 {
         int ic;
-        size_t nn = n * n;
-        size_t n2 = n*(n+1)/2;
+        size_t nn = (size_t)n * n;
+        size_t n2 = (size_t)n*(n+1)/2;
 #pragma omp for schedule (static)
         for (ic = 0; ic < count; ic++) {
                 NPdpack_tril(n, tril+n2*ic, mat+nn*ic);
@@ -263,8 +263,8 @@ void NPzpack_tril_2d(int count, int n, double complex *tril, double complex *mat
         shared(count, n, tril, mat)
 {
         int ic;
-        size_t nn = n * n;
-        size_t n2 = n*(n+1)/2;
+        size_t nn = (size_t)n * n;
+        size_t n2 = (size_t)n*(n+1)/2;
 #pragma omp for schedule (static)
         for (ic = 0; ic < count; ic++) {
                 NPzpack_tril(n, tril+n2*ic, mat+nn*ic);
diff --git a/pyscf/lib/np_helper/transpose.c b/pyscf/lib/np_helper/transpose.c
index cb4ba042d5..265debc3f9 100644
--- a/pyscf/lib/np_helper/transpose.c
+++ b/pyscf/lib/np_helper/transpose.c
@@ -56,7 +56,7 @@ void NPdtranspose_021(int *shape, double *a, double *at)
         shared(shape, a, at)
 {
         int ic;
-        size_t nm = shape[1] * shape[2];
+        size_t nm = (size_t)shape[1] * shape[2];
 #pragma omp for schedule (static)
         for (ic = 0; ic < shape[0]; ic++) {
                 NPdtranspose(shape[1], shape[2], a+ic*nm, at+ic*nm);
@@ -70,7 +70,7 @@ void NPztranspose_021(int *shape, double complex *a, double complex *at)
         shared(shape, a, at)
 {
         int ic;
-        size_t nm = shape[1] * shape[2];
+        size_t nm = (size_t)shape[1] * shape[2];
 #pragma omp for schedule (static)
         for (ic = 0; ic < shape[0]; ic++) {
                 NPztranspose(shape[1], shape[2], a+ic*nm, at+ic*nm);
@@ -132,7 +132,7 @@ void NPdsymm_021_sum(int *shape, double *a, double *out, int hermi)
         shared(shape, a, out, hermi)
 {
         int ic;
-        size_t nn = shape[1] * shape[1];
+        size_t nn = (size_t)shape[1] * shape[1];
 #pragma omp for schedule (static)
         for (ic = 0; ic < shape[0]; ic++) {
                 NPdsymm_sum(shape[1], a+ic*nn, out+ic*nn, hermi);
@@ -146,7 +146,7 @@ void NPzhermi_021_sum(int *shape, double complex *a, double complex *out, int he
         shared(shape, a, out, hermi)
 {
         int ic;
-        size_t nn = shape[1] * shape[1];
+        size_t nn = (size_t)shape[1] * shape[1];
 #pragma omp for schedule (static)
         for (ic = 0; ic < shape[0]; ic++) {
                 NPzhermi_sum(shape[1], a+ic*nn, out+ic*nn, hermi);
diff --git a/pyscf/lib/numpy_helper.py b/pyscf/lib/numpy_helper.py
index 276f0288cf..fd90ff4070 100644
--- a/pyscf/lib/numpy_helper.py
+++ b/pyscf/lib/numpy_helper.py
@@ -153,10 +153,16 @@ def contract(subscripts, A, B, alpha=1, beta=0, out=None, **kwargs):
     if A.size < EINSUM_MAX_SIZE or B.size < EINSUM_MAX_SIZE:
         return _numpy_einsum(idx_str, A, B, alpha=alpha, beta=beta, out=out)
 
+    C_dtype = numpy.result_type(A, B)
     if EINSUM_BACKEND == 'pytblis':
+        # pytblis cannot apply alpha/beta when it has to fall back to numpy
+        # tensordot, and it requires the output to share the IEEE type of the
+        # inputs. Route these cases through numpy instead.
+        if ((out is not None and out.dtype != C_dtype) or
+                numpy.result_type(C_dtype, alpha, beta) != C_dtype):
+            return _numpy_einsum(idx_str, A, B, alpha=alpha, beta=beta, out=out)
         return pytblis.contract(idx_str, A, B, alpha=alpha, beta=beta, out=out)
 
-    C_dtype = numpy.result_type(A, B)
     if EINSUM_BACKEND =='pyscf-tblis' and C_dtype == numpy.double:
         # tblis is slow for complex type
         return tblis_einsum.contract(idx_str, A, B, alpha=alpha, beta=beta, out=out)
diff --git a/pyscf/lib/pbc/fill_ints.c b/pyscf/lib/pbc/fill_ints.c
index 8c1a30fb2e..4e76098263 100644
--- a/pyscf/lib/pbc/fill_ints.c
+++ b/pyscf/lib/pbc/fill_ints.c
@@ -1350,7 +1350,8 @@ static int _nr2c_fill(int (*intor)(), double complex *out,
         int ishloc[ish1-ish0+1];
         int nishloc = shloc_partition(ishloc, ao_loc, ish0, ish1, dimax);
 
-        int m, msh0, msh1, dmjc, ish, di, empty;
+        int m, msh0, msh1, dmjc, ish, di;
+        int empty = 1;
         int jL;
         int shls[2];
         double *bufk_r = buf;
diff --git a/pyscf/lib/pbc/fill_ints_screened.c b/pyscf/lib/pbc/fill_ints_screened.c
index 5d100c7ae3..63c648daae 100644
--- a/pyscf/lib/pbc/fill_ints_screened.c
+++ b/pyscf/lib/pbc/fill_ints_screened.c
@@ -265,7 +265,7 @@ static void _nr3c_screened_sum_auxbas_fill_g(int (*intor)(), void (*fsort)(), do
 
         for (ksh = ksh0; ksh < ksh1; ksh++){
             dk = ao_loc[ksh+1] - ao_loc[ksh];
-            assert(dk < dkmax);
+            assert(dk <= dkmax);
             dijk = dij * dk;
             shls[2] = ksh;
             ksh_off = ksh - nshij;
@@ -646,7 +646,7 @@ static void _nr3c1e_screened_nuc_grad_fill_g(int (*intor)(), void (*fcontract)()
 
     for (ksh = ksh0; ksh < ksh1; ksh++){
         dk = ao_loc[ksh+1] - ao_loc[ksh];
-        assert(dk < dkmax);
+        assert(dk <= dkmax);
         dijk = dij * dk;
         shls[2] = ksh;
         ksh_off = ksh - nbas*2;
@@ -894,7 +894,8 @@ static int _nr2c_screened_fill(
         int ishloc[ish1-ish0+1];
         int nishloc = shloc_partition(ishloc, ao_loc, ish0, ish1, dimax);
 
-        int m, msh0, msh1, dijc, dmjc, ish, di, empty;
+        int m, msh0, msh1, dijc, dmjc, ish, di;
+        int empty = 1;
         int jL, idx_j;
         int shls[2];
         double *bufk_r = buf;
diff --git a/pyscf/lib/pbc/hf_grad.c b/pyscf/lib/pbc/hf_grad.c
index 7c781fba19..aa3ca32ede 100644
--- a/pyscf/lib/pbc/hf_grad.c
+++ b/pyscf/lib/pbc/hf_grad.c
@@ -67,7 +67,10 @@ void contract_vhf_dm(double* out, double* vhf, double* dm,
         jsh = ij % njsh + jsh0;
 
         if (nl0 != NULL) {
-            nimgs = ((nl0->pairs)[ish*nbas + jsh])->nimgs;
+            // Use nl0->njsh, not nbas: the neighbor list may have been built
+            // with a narrower shls_slice and indexing with nbas would read
+            // past the pairs allocation.
+            nimgs = ((nl0->pairs)[ish*nl0->njsh + jsh])->nimgs;
         }
         if (nimgs > 0) { // this shell pair has contribution
             p0 = ao_loc[ish] - ao_loc[ish0];
@@ -80,7 +83,10 @@ void contract_vhf_dm(double* out, double* vhf, double* dm,
             pdm = dm + (p0 * naoj + q0);
             for (ic = 0; ic < comp; ic++) {
                 for (i = 0; i < ni; i++) {
-                    buf[iatm*3+ic] += ddot_(&nj, pvhf+i*naoj, &I1, pdm+i*naoj, &I1);
+                    // Stride is comp, not 3: comp is the gradient/derivative
+                    // component count and is 3 for first derivatives but can
+                    // be larger for higher-order routines.
+                    buf[iatm*comp+ic] += ddot_(&nj, pvhf+i*naoj, &I1, pdm+i*naoj, &I1);
                 }
                 pvhf += naoi * naoj;
             }
diff --git a/pyscf/lib/pbc/inner_dot.c b/pyscf/lib/pbc/inner_dot.c
index a44458957c..4d9cd5ad99 100644
--- a/pyscf/lib/pbc/inner_dot.c
+++ b/pyscf/lib/pbc/inner_dot.c
@@ -252,13 +252,13 @@ void PBC_zdot_CNC_s1(double *outR, double *outI, double *aR, double *aI,
                         }
                         dg = ig1 - ig0;
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &D1, cR+ig0, &ng, bufR, &gsize, &D1, outR, &nc);
+                               &D1, cR+ig0, &ng, bufR, &gsize, &D1, poutR, &nc);
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &D1, cI+ig0, &ng, bufI, &gsize, &D1, outR, &nc);
+                               &D1, cI+ig0, &ng, bufI, &gsize, &D1, poutR, &nc);
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &D1, cR+ig0, &ng, bufI, &gsize, &D1, outI, &nc);
+                               &D1, cR+ig0, &ng, bufI, &gsize, &D1, poutI, &nc);
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &ND1, cI+ig0, &ng, bufR, &gsize, &D1, outI, &nc);
+                               &ND1, cI+ig0, &ng, bufR, &gsize, &D1, poutI, &nc);
                 }
         }
         free(bufR);
@@ -378,13 +378,13 @@ void PBC_zdot_CNN_s1(double *outR, double *outI, double *aR, double *aI,
                         }
                         dg = ig1 - ig0;
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &D1, cR+ig0, &ng, bufR, &gsize, &D1, outR, &nc);
+                               &D1, cR+ig0, &ng, bufR, &gsize, &D1, poutR, &nc);
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &ND1, cI+ig0, &ng, bufI, &gsize, &D1, outR, &nc);
+                               &ND1, cI+ig0, &ng, bufI, &gsize, &D1, poutR, &nc);
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &D1, cR+ig0, &ng, bufI, &gsize, &D1, outI, &nc);
+                               &D1, cR+ig0, &ng, bufI, &gsize, &D1, poutI, &nc);
                         dgemm_(&TRANS_T, &TRANS_N, &nc, &dab, &dg,
-                               &D1, cI+ig0, &ng, bufR, &gsize, &D1, outI, &nc);
+                               &D1, cI+ig0, &ng, bufR, &gsize, &D1, poutI, &nc);
                 }
         }
         free(bufR);
diff --git a/pyscf/lib/pbc/nr_direct.c b/pyscf/lib/pbc/nr_direct.c
index 299340186d..6ed6711b87 100644
--- a/pyscf/lib/pbc/nr_direct.c
+++ b/pyscf/lib/pbc/nr_direct.c
@@ -954,8 +954,6 @@ void PBCVHF_direct_drv_nodddd(
                         k = kl / nlsh + ksh0;
                         l = kl % nlsh + lsh0;
                 }
-                k = kl / nlsh + ksh0;
-                l = kl % nlsh + lsh0;
                 qklkl_max = _max_qindex(qindex, Nbas,
                                         seg2sh[seg_loc[k]], seg2sh[seg_loc[k+1]],
                                         seg2sh[seg_loc[l]], seg2sh[seg_loc[l+1]]);
diff --git a/pyscf/lib/pbc/optimizer.c b/pyscf/lib/pbc/optimizer.c
index a37494ca0a..366815da80 100644
--- a/pyscf/lib/pbc/optimizer.c
+++ b/pyscf/lib/pbc/optimizer.c
@@ -43,7 +43,7 @@ void PBCdel_optimizer(PBCOpt **opt)
         if (opt0->rrcut != NULL) {
                 free(opt0->rrcut);
         }
-        if (!opt0->rcut) {
+        if (opt0->rcut != NULL) {
                 free(opt0->rcut);
         }
         free(opt0);
diff --git a/pyscf/lib/pdft/nr_numint.c b/pyscf/lib/pdft/nr_numint.c
index 6a14771b11..ae8cf94b2a 100644
--- a/pyscf/lib/pdft/nr_numint.c
+++ b/pyscf/lib/pdft/nr_numint.c
@@ -79,7 +79,7 @@ static void dot_ao_mo(double *vv, double *ao, double *mo,
                                 lenj = MIN(nmo-b0j, BOXSIZE);
                                 dgemm_(&TRANS_T, &TRANS_N, &lenj, &leni, &bgrids, &D1,
                                        mo+b0j*ngrids, &ngrids, ao+b0i*ngrids, &ngrids,
-                                       &D1, vv+b0i*nao+b0j, &nmo);
+                                       &D1, vv+b0i*nmo+b0j, &nmo);
                         } 
                 } }
         } else {
diff --git a/pyscf/lib/test/test_chkfile.py b/pyscf/lib/test/test_chkfile.py
index 06101f3e4b..fe136177fa 100644
--- a/pyscf/lib/test/test_chkfile.py
+++ b/pyscf/lib/test/test_chkfile.py
@@ -15,14 +15,13 @@
 
 import unittest
 import numpy
-import tempfile
 from pyscf import lib, gto
 
 class KnownValues(unittest.TestCase):
     def test_save_load_mol(self):
         mol = gto.M(atom=[['H', (0,0,i)] for i in range(8)],
                     basis='sto3g')
-        fchk = tempfile.NamedTemporaryFile()
+        fchk = lib.NamedTemporaryFile()
         lib.chkfile.save_mol(mol, fchk.name)
         mol1 = lib.chkfile.load_mol(fchk.name)
         self.assertTrue(numpy.all(mol1._atm == mol._atm))
@@ -30,7 +29,7 @@ def test_save_load_mol(self):
         self.assertTrue(numpy.all(mol1._env == mol._env))
 
     def test_save_load_arrays(self):
-        fchk = tempfile.NamedTemporaryFile()
+        fchk = lib.NamedTemporaryFile()
         a = numpy.eye(3)
         lib.chkfile.save(fchk.name, 'a', a)
         self.assertTrue(numpy.all(a == lib.chkfile.load(fchk.name, 'a')))
diff --git a/pyscf/lib/test/test_diis.py b/pyscf/lib/test/test_diis.py
index c568e84b65..0abce1377b 100644
--- a/pyscf/lib/test/test_diis.py
+++ b/pyscf/lib/test/test_diis.py
@@ -15,7 +15,6 @@
 
 import unittest
 import numpy
-import tempfile
 from pyscf import lib, gto
 
 def make_ab(n):
@@ -51,7 +50,7 @@ def test_without_errvec(self):
     def test_restore(self):
         a, b, adiag, arest, x = make_ab(16)
         lib.diis.INCORE_SIZE, bak = 4, lib.diis.INCORE_SIZE
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         ad = lib.diis.DIIS(filename=ftmp.name)
         for i in range(8):
             x = (b - arest.dot(x)) / adiag
diff --git a/pyscf/lib/test/test_einsum.py b/pyscf/lib/test/test_einsum.py
index 4109a6b854..6d247cc2ea 100644
--- a/pyscf/lib/test/test_einsum.py
+++ b/pyscf/lib/test/test_einsum.py
@@ -3,6 +3,11 @@
 from pyscf import lib
 einsum = lib.einsum
 
+# pytblis does not support contractions between operands of different
+# floating-point precision (e.g. float64 vs float32); real/complex mixing at
+# the same precision is fine
+_pytblis = lib.numpy_helper.EINSUM_BACKEND == 'pytblis'
+
 def setUpModule():
     global bak
     lib.numpy_helper.EINSUM_MAX_SIZE, bak = 0, lib.numpy_helper.EINSUM_MAX_SIZE
@@ -92,6 +97,7 @@ def test_dslice_dslice1(self):
         self.assertTrue(c0.dtype == c1.dtype)
         self.assertTrue(abs(c0-c1).max() < 1e-14)
 
+    @unittest.skipIf(_pytblis, 'pytblis does not support operands of different precision')
     def test_d_cslice(self):
         a = numpy.random.random((7,1,3,4))
         b = numpy.random.random((2,4,5,7)).astype(numpy.float32)
@@ -100,6 +106,7 @@ def test_d_cslice(self):
         self.assertTrue(c0.dtype == c1.dtype)
         self.assertTrue(abs(c0-c1).max() < 1e-14)
 
+    @unittest.skipIf(_pytblis, 'pytblis does not support operands of different precision')
     def test_z_cslice(self):
         a = numpy.random.random((7,1,3,4)).astype(numpy.float32) + 0j
         b = numpy.random.random((2,4,5,7))
@@ -108,6 +115,7 @@ def test_z_cslice(self):
         self.assertTrue(c0.dtype == c1.dtype)
         self.assertTrue(abs(c0-c1).max() < 1e-14)
 
+    @unittest.skipIf(_pytblis, 'pytblis does not support operands of different precision')
     def test_cslice_dslice(self):
         a = numpy.random.random((7,1,3,4)).astype(numpy.float32) + 0j
         b = numpy.random.random((2,4,5,7))
diff --git a/pyscf/lib/vhf/fill_nr_s8.c b/pyscf/lib/vhf/fill_nr_s8.c
index 4a8b1747e6..368799f661 100644
--- a/pyscf/lib/vhf/fill_nr_s8.c
+++ b/pyscf/lib/vhf/fill_nr_s8.c
@@ -92,7 +92,7 @@ static void store_ij(int (*intor)(), double *eri, double *buf, int ish, int jsh,
         for (i0 = ao_loc[ish], i = 0; i < di; i++, i0++) {
         for (j0 = ao_loc[jsh], j = 0; j < dj; j++, j0++) {
                 if (i0 >= j0) {
-                        ij0 = i0*(i0+1)/2 + j0;
+                        ij0 = (size_t)i0*(i0+1)/2 + j0;
                         peri = eri + ij0*(ij0+1)/2;
                         pbuf = buf + nao2 * (i*dj+j);
                         for (kl = 0, k = 0; k < i0; k++) {
@@ -123,11 +123,12 @@ void GTO2e_cart_or_sph(int (*intor)(), CINTOpt *cintopt, double *eri, int *ao_lo
 
 #pragma omp parallel
 {
-        int i, j, ij;
-        double *buf = malloc(sizeof(double) * (di*di*nao*nao + cache_size));
+        size_t i, j, ij;
+        double *buf = malloc(sizeof(double) * ((size_t)di*di*nao*nao + cache_size));
+        size_t nshell_pairs = (size_t)nbas*(nbas+1)/2;
 #pragma omp for nowait schedule(dynamic, 2)
-        for (ij = 0; ij < nbas*(nbas+1)/2; ij++) {
-                i = (int)(sqrt(2*ij+.25) - .5 + 1e-7);
+        for (ij = 0; ij < nshell_pairs; ij++) {
+                i = (size_t)(sqrt(2.*ij+.25) - .5 + 1e-7);
                 j = ij - (i*(i+1)/2);
                 store_ij(intor, eri, buf, i, j, vhfopt, &envs);
         }
diff --git a/pyscf/lib/vhf/hessian_screen.c b/pyscf/lib/vhf/hessian_screen.c
index 94af78a770..010cb4a6a1 100644
--- a/pyscf/lib/vhf/hessian_screen.c
+++ b/pyscf/lib/vhf/hessian_screen.c
@@ -120,7 +120,7 @@ void CVHFnr_int2e_pp_q_cond(int (*intor)(), CINTOpt *cintopt, double *q_cond,
                             int *ao_loc, int *atm, int natm,
                             int *bas, int nbas, double *env)
 {
-        int nbas2 = nbas * nbas;
+        size_t nbas2 = (size_t)nbas * nbas;
         int shls_slice[] = {0, nbas};
         const int cache_size = GTOmax_cache_size(intor, shls_slice, 1,
                                                  atm, natm, bas, nbas, env);
@@ -137,7 +137,7 @@ void CVHFnr_int2e_pp_q_cond(int (*intor)(), CINTOpt *cintopt, double *q_cond,
                 dj = ao_loc[ish+1] - ao_loc[ish];
                 di = MAX(di, dj);
         }
-        double *buf = malloc(sizeof(double) * 9 * di*di*di*di);
+        double *buf = malloc(sizeof(double) * 9 * (size_t)di*di*di*di);
         double *bufx = buf;
         double *bufy, *bufz;
 #pragma omp for schedule(dynamic, 4)
@@ -202,7 +202,8 @@ void CVHFgrad_jk_direct_scf_dm(CVHFOpt *opt, double *dm, int nset, int *ao_loc,
                 free(opt->dm_cond);
         }
         nbas = opt->nbas;
-        opt->dm_cond = (double *)malloc(sizeof(double) * nbas*nbas);
+        size_t Nbas = nbas;
+        opt->dm_cond = (double *)malloc(sizeof(double) * Nbas*Nbas);
         CVHFnr_dm_cond1(opt->dm_cond, dm, nset, ao_loc, atm, natm, bas, nbas, env);
 }
 
@@ -288,7 +289,7 @@ void CVHFnr_int2e_pppp_q_cond(int (*intor)(), CINTOpt *cintopt, double *q_cond,
                               int *ao_loc, int *atm, int natm,
                               int *bas, int nbas, double *env)
 {
-        int nbas2 = nbas * nbas;
+        size_t nbas2 = (size_t)nbas * nbas;
         int shls_slice[] = {0, nbas};
         const int cache_size = GTOmax_cache_size(intor, shls_slice, 1,
                                                  atm, natm, bas, nbas, env);
@@ -305,7 +306,7 @@ void CVHFnr_int2e_pppp_q_cond(int (*intor)(), CINTOpt *cintopt, double *q_cond,
                 dj = ao_loc[ish+1] - ao_loc[ish];
                 di = MAX(di, dj);
         }
-        double *buf = malloc(sizeof(double) * 256 * di*di*di*di);
+        double *buf = malloc(sizeof(double) * 256 * (size_t)di*di*di*di);
         double *bufxx = buf;
         double *bufxy, *bufxz, *bufyx, *bufyy, *bufyz, *bufzx, *bufzy, *bufzz;
 #pragma omp for schedule(dynamic, 4)
@@ -363,7 +364,7 @@ void CVHFipip1_direct_scf(CVHFOpt *opt, int (*intor)(), CINTOpt *cintopt,
         size_t Nbas = nbas;
         size_t Nbas2 = Nbas * Nbas;
         // First n*n elements for derivatives, the next n*n elements for regular ERIs
-        opt->q_cond = (double *)malloc(sizeof(double) * nbas*nbas*2);
+        opt->q_cond = (double *)malloc(sizeof(double) * Nbas2*2);
 
         if (ao_loc[nbas] == CINTtot_cgto_spheric(bas, nbas)) {
                 CVHFnr_int2e_q_cond(int2e_sph, NULL, opt->q_cond+Nbas2, ao_loc,
diff --git a/pyscf/lib/vhf/nr_direct.c b/pyscf/lib/vhf/nr_direct.c
index 130b40b507..c783583927 100644
--- a/pyscf/lib/vhf/nr_direct.c
+++ b/pyscf/lib/vhf/nr_direct.c
@@ -18,6 +18,7 @@
 
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <assert.h>
 #include <math.h>
 //#include <omp.h>
@@ -252,6 +253,11 @@ JKArray *CVHFallocate_JKArray(JKOperator *op, int *shls_slice, int *ao_loc,
         }
         jkarray->stack_size = 0;
         jkarray->data = malloc(sizeof(double) * (size_limit + 136*136));
+        if (jkarray->data == NULL) {
+                fprintf(stderr, "malloc(%zu) failed in CVHFallocate_JKArray\n",
+                        sizeof(double) * (size_limit + 136*136));
+                exit(1);
+        }
         jkarray->ncomp = ncomp;
         int keys_max = size_limit / (AO_BLOCK_SIZE*AO_BLOCK_SIZE*ncomp);
         jkarray->keys_cache = malloc(sizeof(int) * keys_max);
@@ -368,9 +374,9 @@ void CVHFnr_direct_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
                                                             shls_slice, ao_loc);
         }
 
-        size_t di = GTOmax_shell_dim(ao_loc, shls_slice, 4);
-        size_t cache_size = GTOmax_cache_size(intor, shls_slice, 4,
-                                              atm, natm, bas, nbas, env);
+        int64_t di = GTOmax_shell_dim(ao_loc, shls_slice, 4);
+        int64_t cache_size = GTOmax_cache_size(intor, shls_slice, 4,
+                                               atm, natm, bas, nbas, env);
         int ish0 = shls_slice[0];
         int ish1 = shls_slice[1];
         int jsh0 = shls_slice[2];
@@ -387,17 +393,23 @@ void CVHFnr_direct_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
         int *block_jloc = block_iloc + nish + 1;
         int *block_kloc = block_jloc + njsh + 1;
         int *block_lloc = block_kloc + nksh + 1;
-        uint32_t nblock_i = CVHFshls_block_partition(block_iloc, shls_slice+0, ao_loc, AO_BLOCK_SIZE);
-        uint32_t nblock_j = CVHFshls_block_partition(block_jloc, shls_slice+2, ao_loc, AO_BLOCK_SIZE);
-        uint32_t nblock_k = CVHFshls_block_partition(block_kloc, shls_slice+4, ao_loc, AO_BLOCK_SIZE);
-        uint32_t nblock_l = CVHFshls_block_partition(block_lloc, shls_slice+6, ao_loc, AO_BLOCK_SIZE);
-        uint32_t nblock_kl = nblock_k * nblock_l;
-        uint32_t nblock_jkl = nblock_j * nblock_kl;
+        // size_t to keep nblock^3 from overflowing for large molecules. The
+        // same fix was applied to nr_sr_vhf.c.
+        size_t nblock_i = CVHFshls_block_partition(block_iloc, shls_slice+0, ao_loc, AO_BLOCK_SIZE);
+        size_t nblock_j = CVHFshls_block_partition(block_jloc, shls_slice+2, ao_loc, AO_BLOCK_SIZE);
+        size_t nblock_k = CVHFshls_block_partition(block_kloc, shls_slice+4, ao_loc, AO_BLOCK_SIZE);
+        size_t nblock_l = CVHFshls_block_partition(block_lloc, shls_slice+6, ao_loc, AO_BLOCK_SIZE);
+        size_t nblock_kl = nblock_k * nblock_l;
+        size_t nblock_jkl = nblock_j * nblock_kl;
         int nblock_max = MAX(nblock_i, nblock_j);
         nblock_max = MAX(nblock_max, nblock_k);
         nblock_max = MAX(nblock_max, nblock_l);
-        // up to 1.6 GB per thread
-        int size_limit = (200000000 - di*di*di*di*ncomp - cache_size) / n_dm;
+        // up to 3.2 GB per thread.
+        int64_t size_limit = (400000000 - di*di*di*di*ncomp - cache_size) / n_dm;
+        if (size_limit < 0) {
+                fprintf(stderr, "Insufficient memory for caching CVHFnr_direct_drv intermediates\n");
+                exit(1);
+        }
 
 #pragma omp parallel
 {
@@ -405,7 +417,8 @@ void CVHFnr_direct_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
         int joff = ao_loc[jsh0];
         int koff = ao_loc[ksh0];
         int loff = ao_loc[lsh0];
-        int i, j, k, l, n, r, blk_id;
+        int i, j, k, l, n;
+        size_t r, blk_id;
         JKArray *v_priv[n_dm];
         for (i = 0; i < n_dm; i++) {
                 v_priv[i] = CVHFallocate_JKArray(jkop[i], shls_slice, ao_loc,
@@ -516,9 +529,9 @@ void CVHFnr_direct_ex_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
                                                             shls_slice, ao_loc);
         }
 
-        size_t di = GTOmax_shell_dim(ao_loc, shls_slice, 4);
-        size_t cache_size = GTOmax_cache_size(intor, shls_slice, 4,
-                                              atm, natm, bas, nbas, env);
+        int64_t di = GTOmax_shell_dim(ao_loc, shls_slice, 4);
+        int64_t cache_size = GTOmax_cache_size(intor, shls_slice, 4,
+                                               atm, natm, bas, nbas, env);
         int ish0 = shls_slice[0];
         int ish1 = shls_slice[1];
         int jsh0 = shls_slice[2];
@@ -545,8 +558,12 @@ void CVHFnr_direct_ex_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
         int nblock_max = MAX(nblock_i, nblock_j);
         nblock_max = MAX(nblock_max, nblock_k);
         nblock_max = MAX(nblock_max, nblock_l);
-        // up to 1.6 GB per thread
-        int size_limit = (200000000 - di*di*di*di*ncomp - cache_size) / n_dm;
+        // up to 3.2 GB per thread.
+        int64_t size_limit = (400000000 - di*di*di*di*ncomp - cache_size) / n_dm;
+        if (size_limit < 0) {
+                fprintf(stderr, "Insufficient memory for caching CVHFnr_direct_ex_drv intermediates\n");
+                exit(1);
+        }
 
 #pragma omp parallel
 {
@@ -554,7 +571,8 @@ void CVHFnr_direct_ex_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
         int joff = ao_loc[jsh0];
         int koff = ao_loc[ksh0];
         int loff = ao_loc[lsh0];
-        int i, j, k, l, n, r, blk_id;
+        int i, j, k, l, n;
+        size_t r, blk_id;
         JKArray *v_priv[n_dm];
         for (i = 0; i < n_dm; i++) {
                 v_priv[i] = CVHFallocate_JKArray(jkop[i], shls_slice, ao_loc,
diff --git a/pyscf/lib/vhf/nr_incore.c b/pyscf/lib/vhf/nr_incore.c
index b3472f0262..ffcfea53fe 100644
--- a/pyscf/lib/vhf/nr_incore.c
+++ b/pyscf/lib/vhf/nr_incore.c
@@ -629,8 +629,8 @@ void CVHFnrs8_incore_drv(double *eri, double **dms, double **vjk,
         {
                 int i, j, ic;
                 size_t ij, off;
-                size_t npair = nao*(nao+1)/2;
-                size_t nn = nao * nao;
+                size_t npair = (size_t)nao*(nao+1)/2;
+                size_t nn = (size_t)nao * nao;
                 double *v_priv = calloc(nn*n_dm, sizeof(double));
                 FjkPtr pf;
                 double *pv;
@@ -666,8 +666,8 @@ void CVHFnrs4_incore_drv(double *eri, double **dms, double **vjk,
         {
                 int i, j, ic;
                 size_t ij, off;
-                size_t npair = nao*(nao+1)/2;
-                size_t nn = nao * nao;
+                size_t npair = (size_t)nao*(nao+1)/2;
+                size_t nn = (size_t)nao * nao;
                 double *v_priv = calloc(nn*n_dm, sizeof(double));
                 FjkPtr pf;
                 double *pv;
@@ -703,8 +703,8 @@ void CVHFnrs2ij_incore_drv(double *eri, double **dms, double **vjk,
         {
                 int i, j, ic;
                 size_t ij, off;
-                size_t npair = nao*(nao+1)/2;
-                size_t nn = nao * nao;
+                size_t npair = (size_t)nao*(nao+1)/2;
+                size_t nn = (size_t)nao * nao;
                 double *v_priv = calloc(nn*n_dm, sizeof(double));
                 FjkPtr pf;
                 double *pv;
@@ -740,8 +740,8 @@ void CVHFnrs2kl_incore_drv(double *eri, double **dms, double **vjk,
         {
                 int i, j, ic;
                 size_t ij, off;
-                size_t npair = nao*(nao+1)/2;
-                size_t nn = nao * nao;
+                size_t npair = (size_t)nao*(nao+1)/2;
+                size_t nn = (size_t)nao * nao;
                 double *v_priv = calloc(nn*n_dm, sizeof(double));
                 FjkPtr pf;
                 double *pv;
@@ -777,7 +777,7 @@ void CVHFnrs1_incore_drv(double *eri, double **dms, double **vjk,
         {
                 int i, j, ic;
                 size_t ij, off;
-                size_t nn = nao * nao;
+                size_t nn = (size_t)nao * nao;
                 double *v_priv = calloc(nn*n_dm, sizeof(double));
                 FjkPtr pf;
                 double *pv;
diff --git a/pyscf/lib/vhf/nr_sgx_direct.c b/pyscf/lib/vhf/nr_sgx_direct.c
index 054feed685..6570069296 100644
--- a/pyscf/lib/vhf/nr_sgx_direct.c
+++ b/pyscf/lib/vhf/nr_sgx_direct.c
@@ -840,19 +840,10 @@ void SGXdiagonal_ints(int (*intor)(), double *m_bi, int *ao_loc, CINTOpt *cintop
                        double *widths, double *norms, double *vals, int nrad,
                        double *atm_coords)
 {
-        int shls_slice[] = {0, nbas, 0, nbas};
-        int di = GTOmax_shell_dim(ao_loc, shls_slice, 2);
-        int cache_size = _max_cache_size_sgx(intor, shls_slice, 2,
-                                             atm, natm, bas, nbas, env,
-                                             SGX_BLKSIZE);
 #pragma omp parallel
 {
         int ig0, ig1, dg;
         int ish;
-        int ncomp = 1;
-        double *buf = calloc(sizeof(double), SGX_BLKSIZE*di*di*ncomp);
-        double *cache = malloc(sizeof(double) * cache_size);
-        double *dists = malloc(sizeof(int) * SGX_BLKSIZE);
         const double omega = env[PTR_RANGE_OMEGA];
         double *grids = env + (int) env[PTR_GRIDS];
         double r;
@@ -887,9 +878,6 @@ void SGXdiagonal_ints(int (*intor)(), double *m_bi, int *ao_loc, CINTOpt *cintop
                         m_bi[ibatch * nbas + ish] = maxint;
                 }
         }
-        free(buf);
-        free(cache);
-        free(dists);
 }
 }
 
diff --git a/pyscf/lib/vhf/nr_sr_vhf.c b/pyscf/lib/vhf/nr_sr_vhf.c
index bd4a93fda1..e80caa4c03 100644
--- a/pyscf/lib/vhf/nr_sr_vhf.c
+++ b/pyscf/lib/vhf/nr_sr_vhf.c
@@ -1,4 +1,5 @@
 #include <stdlib.h>
+#include <stdio.h>
 #include <assert.h>
 #include <math.h>
 //#include <omp.h>
@@ -698,9 +699,9 @@ void CVHFnr_sr_direct_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
                                                             shls_slice, ao_loc);
         }
 
-        size_t di = GTOmax_shell_dim(ao_loc, shls_slice, 4);
-        size_t cache_size = GTOmax_cache_size(intor, shls_slice, 4,
-                                              atm, natm, bas, nbas, env);
+        int64_t di = GTOmax_shell_dim(ao_loc, shls_slice, 4);
+        int64_t cache_size = GTOmax_cache_size(intor, shls_slice, 4,
+                                               atm, natm, bas, nbas, env);
         int ish0 = shls_slice[0];
         int ish1 = shls_slice[1];
         int jsh0 = shls_slice[2];
@@ -717,11 +718,17 @@ void CVHFnr_sr_direct_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
         assert(nksh == nish);
         assert(nlsh == nish);
         int *block_loc = malloc(sizeof(int) * (nish+1));
-        uint32_t nblock = CVHFshls_block_partition(block_loc, shls_slice, ao_loc, AO_BLOCK_SIZE);
-        uint32_t nblock2 = nblock * nblock;
-        uint32_t nblock3 = nblock2 * nblock;
-        // up to 1.6 GB per thread
-        int size_limit = (200000000 - di*di*di*di*ncomp - cache_size) / n_dm;
+        // size_t to keep nblock3 = nblock^3 from overflowing for large molecules
+        // (nblock can reach ~10^3-10^4 with AO_BLOCK_SIZE=56).
+        size_t nblock = CVHFshls_block_partition(block_loc, shls_slice, ao_loc, AO_BLOCK_SIZE);
+        size_t nblock2 = nblock * nblock;
+        size_t nblock3 = nblock2 * nblock;
+        // up to 3.2 GB per thread.
+        int64_t size_limit = (400000000 - di*di*di*di*ncomp - cache_size) / n_dm;
+        if (size_limit < 0) {
+                fprintf(stderr, "Insufficient memory for caching CVHFnr_sr_direct_drv intermediates\n");
+                exit(1);
+        }
 
         size_t Nbas = nbas;
         size_t Nbas2 = Nbas * Nbas;
@@ -744,7 +751,8 @@ void CVHFnr_sr_direct_drv(int (*intor)(), void (*fdot)(), JKOperator **jkop,
         float log_cutoff = vhfopt->log_cutoff;
         float ij_cutoff, ik_cutoff, il_cutoff;
         float dm_max0, dm_max, log_dm;
-        int i, j, k, l, n, r, blk_id;
+        int i, j, k, l, n;
+        size_t r, blk_id;
         JKArray *v_priv[n_dm];
         for (i = 0; i < n_dm; i++) {
                 v_priv[i] = CVHFallocate_JKArray(jkop[i], shls_slice, ao_loc,
diff --git a/pyscf/lib/vhf/rkb_screen.c b/pyscf/lib/vhf/rkb_screen.c
index 223d1a64bb..613c55bfed 100644
--- a/pyscf/lib/vhf/rkb_screen.c
+++ b/pyscf/lib/vhf/rkb_screen.c
@@ -485,8 +485,13 @@ void CVHFrkbssll_direct_scf_dm(CVHFOpt *opt, double complex *dm, int nset,
                 exit(1);
         }
         nset = nset / 4;
-        size_t nbas2 = nbas * nbas;
+        size_t nbas2 = (size_t)nbas * nbas;
         opt->dm_cond = (double *)malloc(sizeof(double)*nbas2*4*(1+nset));
+        // CVHFrkbssll_dm_cond only writes the LL/SS/SL diagonal blocks for
+        // jsh<=ish; the strict upper triangle of the master slots would
+        // otherwise be read uninitialised by CVHFrkbssll_prescreen. Match
+        // the NPdset0() that the sibling CVHFrkbllll_direct_scf_dm uses.
+        NPdset0(opt->dm_cond, nbas2*4*(1+nset));
         CVHFrkbssll_dm_cond(opt->dm_cond, dm, nset, ao_loc,
                             atm, natm, bas, nbas, env);
 }
diff --git a/pyscf/lib/vhf/test/test_nr_direct.py b/pyscf/lib/vhf/test/test_nr_direct.py
index 0e5894a25e..c4e1c2530f 100644
--- a/pyscf/lib/vhf/test/test_nr_direct.py
+++ b/pyscf/lib/vhf/test/test_nr_direct.py
@@ -15,7 +15,6 @@
 
 import os
 import ctypes
-import _ctypes
 import unittest
 import numpy
 from pyscf import lib
@@ -68,7 +67,7 @@ def runjk(dm1, ncomp, intorname, filldot, *namejk):
     dmsptr = (ctypes.c_void_p*(njk*n_dm))()
     vjkptr = (ctypes.c_void_p*(njk*n_dm))()
     for i, symb in enumerate(namejk):
-        f1 = ctypes.c_void_p(_ctypes.dlsym(libcvhf2._handle, symb))
+        f1 = ctypes.cast(getattr(libcvhf2, symb), ctypes.c_void_p)
         for j in range(n_dm):
             dmsptr[i*n_dm+j] = dm1[j].ctypes.data_as(ctypes.c_void_p)
             vjkptr[i*n_dm+j] = vjk[i,j*ncomp].ctypes.data_as(ctypes.c_void_p)
diff --git a/pyscf/lo/orth.py b/pyscf/lo/orth.py
index 4cfe331906..5c623420c9 100644
--- a/pyscf/lo/orth.py
+++ b/pyscf/lo/orth.py
@@ -240,7 +240,13 @@ def ecp_ano_det_ovlp(atm_ecp, atm_ano, ecpcore):
         if symb in mol._basis:
             ano = aos[symb]
         else:
-            ano = aos[mol.atom_pure_symbol(ia)]
+            try:
+                ano = aos[mol.atom_pure_symbol(ia)]
+            except KeyError:
+                if symb in mol._ecp and mol._ecp[symb][0] == 0:
+                    # Skip ECPs with nelec=0 (typically used for ghost atoms or custom ECPs
+                    # in QM/MM embedding); see PR #3243 for details.
+                    continue
         p0, p1 = p1, p1 + ano.shape[1]
         c[p0:p1,p0:p1] = ano
     return c
diff --git a/pyscf/lo/test/test_orth.py b/pyscf/lo/test/test_orth.py
index ef8ba90bd8..1e58319b5b 100644
--- a/pyscf/lo/test/test_orth.py
+++ b/pyscf/lo/test/test_orth.py
@@ -118,6 +118,52 @@ def test_pre_orth_ao_with_ecp(self):
         c0 = orth.pre_orth_ao(mol, method='ano')
         self.assertAlmostEqual(numpy.linalg.norm(c0), 5.9621174285790959, 9)
 
+    def test_pre_orth_ao_with_coreless_ecp(self):
+        mol = gto.M(atom = '''
+            ghost-Cu2  2.33770     1.38257    -2.24106
+            Cu0       -0.51523     1.42830    -3.17698
+            Cu0        0.39700     3.33838    -1.04703
+            Cu0        1.33168     3.68956    -3.87904
+            O0         0.88779     2.45970    -2.58603
+            O0         3.78761     0.30544    -1.89609
+            O0        -0.09379     4.21706     0.49196
+            O0        -1.91824     0.39690    -3.76793
+            O0         1.77557     4.91941    -5.17205
+            X-Cu1      5.19063     1.33684    -1.30514
+            X-Cu1      4.27840    -0.57324    -3.43509
+            X-Cu1      3.34372    -0.92441    -0.60308
+            X-Cu1      1.30922     5.24846     1.08291
+            X-Cu1      3.17859     5.95081    -4.58110
+            X-Cu1     -1.42745    -0.48178    -5.30692
+            X-Cu1      2.26636     4.04073    -6.71105
+            X-Cu1     -0.53769     2.98721     1.78498
+            X-Cu1     -2.36214    -0.83295    -2.47491
+            X-Cu1      0.32566     5.99654    -5.51702
+            X-Cu1     -1.54371     5.29420     0.14700
+            X-Cu1     -3.36816     1.47403    -4.11289
+            ''',
+            verbose=0,
+            spin=1,
+            charge=-6,
+            basis={'Cu0': 'cc-pvdz', 'O0': 'cc-pvdz', 'ghost-Cu2': gto.basis.load('cc-pvdz', 'Cu')},
+            ecp={'Cu': 'cc-pvdz-pp',
+                    'X-Cu1': gto.basis.parse_ecp('''
+                    Cu nelec 0
+                    Cu ul
+                    2       1.000000000            0.000000000
+                    Cu S
+                    2      30.220000000          355.770158000
+                    2      13.190000000           70.865357000
+                    Cu P
+                    2      33.130000000          233.891976000
+                    2      13.220000000           53.947299000
+                    Cu D
+                    2      38.420000000          -31.272165000
+                    2      13.260000000           -2.741104000
+                    ''')})
+
+        c = orth.pre_orth_ao(mol, method='ano')
+        self.assertAlmostEqual(numpy.linalg.norm(c), 35.14205617894, 9)
 
 if __name__ == "__main__":
     print("Test orth")
diff --git a/pyscf/mcpdft/lpdft.py b/pyscf/mcpdft/lpdft.py
index 34db9432e3..a04985f348 100644
--- a/pyscf/mcpdft/lpdft.py
+++ b/pyscf/mcpdft/lpdft.py
@@ -686,7 +686,7 @@ def linear_multi_state(mc, weights=(0.5, 0.5), **kwargs):
         mc = mc.state_average(weights=weights, **kwargs)
 
     else:
-        base_name = mc.__class__.bases__[0].__name__
+        base_name = mc.__class__.__bases__[0].__name__
 
     mcbase_class = mc.__class__
 
@@ -724,7 +724,7 @@ def linear_multi_state_mix(mc, fcisolvers, weights=(0.5, 0.5), **kwargs):
         raise RuntimeError("already a StateAverageMCSCF solver")
 
     else:
-        base_name = mc.__class__.bases__[0].__name__
+        base_name = mc.__class__.__bases__[0].__name__
 
     mcbase_class = mc.__class__
 
diff --git a/pyscf/mcpdft/otfnal.py b/pyscf/mcpdft/otfnal.py
index a24483f812..406030be7e 100644
--- a/pyscf/mcpdft/otfnal.py
+++ b/pyscf/mcpdft/otfnal.py
@@ -35,6 +35,8 @@
 OT_ALIAS = {
     'MC23': 'tMC23',
     'MC25': 'tMC25',
+    'MC26': 'tMC26',
+    'COF26': 'tCOF26',
 }
 OT_HYB_ALIAS = {'PBE0' : '0.25*HF + 0.75*PBE, 0.25*HF + 0.75*PBE',
                 }
@@ -86,6 +88,82 @@
         'hyb': (0.28, 0.28, 0),
         'facs': (0.72, 0.72)
     },
+
+    # MC26 = a0*E_CAS + E_xc[rep-M06L]
+    # Y. Chen, D. Zhang, D. G. Truhlar, and X. He
+    # Pushing the accuracy of on-top functionals with agent-driven
+    # supervised learning, arXiv:2605.06215 (2026).
+    # https://arxiv.org/abs/2605.06215
+    'MC26': {
+        'xc_base': 'M06L',
+        'ext_params': {
+            203: np.array([12.793598175048828, 1.0464407205581665, -1.1021970510482788,
+                           -1.4680061340332031, 1.0868027210235596, 11.653898239135742,
+                           -3.4057228565216064, -20.206926345825195, -1.7893168926239014,
+                           14.40688705444336, 1.7784547805786133, -0.3958134949207306,
+                           -12.139795303344727, -0.0605972521007061, 0.016891608014702797,
+                           -7.153533806558698e-05, 0.0001199805992655456, 0.0]),
+            233: np.array([0.06, 0.0031, 0.00515088, 0.00304966,
+                           -0.6178147196769714, 8.792010307312012, -8.655962944030762,
+                           15.397195816040039, -9.685625076293945, 2.904688835144043,
+                           -0.982710599899292, 1.7047909498214722, -1.9396733045578003,
+                           -5.875694274902344, 1.1270228624343872, -0.29264968633651733,
+                           0.10097602754831314, 0.002418402349576354,
+                           -0.0004997584619559348, 0.0, -1.0493528842926025,
+                           -0.03480437397956848, 0.01626494713127613,
+                           7.84311632742174e-05, 0.000405816943384707, 0.0, 1e-10]),
+        },
+        'hyb': (0.278090700064691, 0.278090700064691, 0),
+    },
+
+    # COF26 = a0*E_CAS + E_xc[rep-M06L + rep-MN15L]
+    # Y. Chen, D. Zhang, D. G. Truhlar, and X. He
+    # Pushing the accuracy of on-top functionals with agent-driven
+    # supervised learning, arXiv:2605.06215 (2026).
+    # https://arxiv.org/abs/2605.06215
+    'COF26': {
+        'xc_base': 'MGGA_X_M06_L + MGGA_X_MN15_L, MGGA_C_M06_L + MGGA_C_MN15_L',
+        'ext_params': {
+            203: np.array([4.46751594543457, -0.620290219783783, -0.02489340677857399,
+                           -1.9508483409881592, 3.8321266174316406, -4.5821146965026855,
+                           -5.959300518035889, -0.26544812321662903, -1.444387435913086,
+                           0.7572097778320312, 3.510108470916748, -1.1088151931762695,
+                           -3.569631576538086, -0.06943392008543015, 0.042370155453681946,
+                           7.512031879741699e-05, -0.000407030078349635, 0.0]),
+            233: np.array([0.06, 0.0031, 0.00515088, 0.00304966,
+                           -4.060972213745117, 8.054978370666504, 0.16315306723117828,
+                           0.20903074741363525, 1.67588472366333, 0.837023913860321,
+                           -1.3942575454711914, -2.884153366088867, -0.7865201830863953,
+                           5.253849029541016, -6.900444984436035, -0.07099238783121109,
+                           -0.9084649085998535, -2.1175485017010942e-05,
+                           0.011801144108176231, 0.0, 1.2753486633300781,
+                           -0.022736486047506332, 0.09527082741260529,
+                           0.000708779611159116, -0.0018802996492013335, 0.0, 1e-10]),
+            260: np.array([1.5309321880340576, -0.5386894345283508, 0.2505153715610504,
+                           4.978420257568359, -5.5219902992248535, 6.497469425201416,
+                           3.688972234725952, -0.6701527833938599, -0.7988651394844055,
+                           -7.4512176513671875, 10.058389663696289, 2.617449998855591,
+                           -4.1134748458862305, -4.58927059173584, 2.2586185932159424,
+                           -8.232332229614258, 4.996926307678223, -4.7641282081604,
+                           -2.3733041286468506, 4.265657424926758, -6.0180840492248535,
+                           -6.202260494232178, 6.2710113525390625, 5.919536590576172,
+                           -0.17825216054916382, -7.480823516845703, 6.210508823394775,
+                           3.045118570327759, -1.476043462753296, -6.93911075592041,
+                           1.2295597791671753, -5.026687145233154, 11.215118408203125,
+                           2.8131494522094727, 5.998229503631592, -2.111699104309082,
+                           -10.391032218933105, -0.4673156142234802, 3.2028167247772217,
+                           -8.067900657653809]),
+            261: np.array([-0.642463207244873, -0.9184160828590393, 6.772172451019287,
+                           -9.329075813293457, 0.7022364139556885, -1.3836524486541748,
+                           11.549406051635742, -0.8307218551635742, 5.020711421966553,
+                           -0.16478510200977325, 1.7352665662765503, -1.243597149848938,
+                           4.824436187744141, -3.134183645248413, 0.6350889801979065,
+                           -7.111184597015381, 3.5491936206817627, -2.827716112136841,
+                           5.681900501251221, -4.908012866973877, 6.956517696380615,
+                           -4.321927070617676, 4.578726768493652, -1.5277433395385742]),
+        },
+        'hyb': (0.30959611760805744, 0.30959611760805744, 0),
+    },
 }
 
 def register_otfnal(xc_code, preset):
diff --git a/pyscf/mcpdft/test/test_lpdft.py b/pyscf/mcpdft/test/test_lpdft.py
index 71be3fb345..655d9c897c 100644
--- a/pyscf/mcpdft/test/test_lpdft.py
+++ b/pyscf/mcpdft/test/test_lpdft.py
@@ -15,7 +15,7 @@
 #
 # Author: Matthew Hennefarth <mhennefarth@uchicago.com>
 
-import tempfile, h5py
+import h5py
 import numpy as np
 from pyscf import gto, scf, dft, fci, lib
 from pyscf import mcpdft
@@ -57,9 +57,9 @@ def get_water(functional='tpbe', basis='6-31g'):
     solver2.spin = 2
 
     mc = mcpdft.CASSCF(mf, functional, 4, 4, grids_level=1)
-    mc.chkfile = tempfile.NamedTemporaryFile().name 
     # mc.chk_ci = True
     mc = mc.multi_state_mix([solver1, solver2], weights, "lin")
+    mc.chkfile = lib.NamedTemporaryFile().name
     mc.run()
     return mc
 
@@ -82,9 +82,9 @@ def get_water_triplet(functional='tPBE', basis="6-31G"):
     solver2.nroots = 2
 
     mc = mcpdft.CASSCF(mf, functional, 4, 4, grids_level=1)
-    mc.chkfile = tempfile.NamedTemporaryFile().name 
     # mc.chk_ci = True
     mc = mc.multi_state_mix([solver1, solver2], weights, "lin")
+    mc.chkfile = lib.NamedTemporaryFile().name
     mc.run()
     return mc
 
diff --git a/pyscf/mcpdft/test/test_mcpdft.py b/pyscf/mcpdft/test/test_mcpdft.py
index 1219d3b278..f96c66a4ef 100644
--- a/pyscf/mcpdft/test/test_mcpdft.py
+++ b/pyscf/mcpdft/test/test_mcpdft.py
@@ -30,7 +30,7 @@
 # Some assertAlmostTrue thresholds are loose because we are only
 # trying to test the API here; we need tight convergence and grids
 # to reproduce well when OMP is on.
-import tempfile, h5py
+import h5py
 import numpy as np
 from pyscf import gto, scf, mcscf, lib, fci, dft
 from pyscf import mcpdft
@@ -46,13 +46,16 @@ def auto_setup(xyz="Li 0 0 0\nH 1.5 0 0", fnal="tPBE"):
         atom=xyz, basis="sto3g", symmetry=True, verbose=0, output="/dev/null"
     )
     mf_nosym = scf.RHF(mol_nosym).run(conv_tol=1e-12)
-    mc_nosym = mcscf.CASSCF(mf_nosym, 5, 2).run(conv_tol=1e-8)
+    mc_nosym = mcscf.CASSCF(mf_nosym, 5, 2)
     mf_sym = scf.RHF(mol_sym).run()
     mc_sym = mcscf.CASSCF(mf_sym, 5, 2).run(conv_tol=1e-8)
+    mc_nosym.run (mo_coeff=mc_sym.mo_coeff,
+                  ci=mc_sym.ci,
+                  conv_tol=1e-8)
     mcp_ss_nosym = mcpdft.CASSCF(mc_nosym, fnal, 5, 2).run(conv_tol=1e-8)
     mcp_ss_sym = (
         mcpdft.CASSCF(mc_sym, fnal, 5, 2)
-        .set(chkfile=tempfile.NamedTemporaryFile().name)#, chk_ci=True)
+        .set(chkfile=lib.NamedTemporaryFile().name, chk_ci=True)
         .run(conv_tol=1e-8)
     )
     mcp_sa_0 = mcp_ss_nosym.state_average(
@@ -60,7 +63,7 @@ def auto_setup(xyz="Li 0 0 0\nH 1.5 0 0", fnal="tPBE"):
             1.0 / 5,
         ]
         * 5
-    ).run(conv_tol=1e-8)
+    )
     solver_S = fci.solver(mol_nosym, singlet=True).set(spin=0, nroots=2)
     solver_T = fci.solver(mol_nosym, singlet=False).set(spin=2, nroots=3)
     mcp_sa_1 = (
@@ -72,7 +75,6 @@ def auto_setup(xyz="Li 0 0 0\nH 1.5 0 0", fnal="tPBE"):
             * 5,
         )
         .set(ci=None)
-        .run(conv_tol=1e-8)
     )
     solver_A1 = fci.solver(mol_sym).set(wfnsym="A1", nroots=3)
     solver_E1x = fci.solver(mol_sym).set(wfnsym="E1x", nroots=1, spin=2)
@@ -85,9 +87,13 @@ def auto_setup(xyz="Li 0 0 0\nH 1.5 0 0", fnal="tPBE"):
             ]
             * 5,
         )
-        .set(ci=None, chkfile=tempfile.NamedTemporaryFile().name)#, chk_ci=True)
+        .set(ci=None, chkfile=lib.NamedTemporaryFile().name, chk_ci=True)
         .run(conv_tol=1e-8)
     )
+    mcp_sa_1.run (mo_coeff=mcp_sa_2.mo_coeff,
+                  conv_tol=1e-8)
+    mcp_sa_0.run (mo_coeff=mcp_sa_2.mo_coeff,
+                  conv_tol=1e-8)
     mcp = [[mcp_ss_nosym, mcp_ss_sym], [mcp_sa_0, mcp_sa_1, mcp_sa_2]]
     nosym = [mol_nosym, mf_nosym, mc_nosym]
     sym = [mol_sym, mf_sym, mc_sym]
diff --git a/pyscf/mcscf/test/test_h2o.py b/pyscf/mcscf/test/test_h2o.py
index b91fa08739..f226f05e32 100644
--- a/pyscf/mcscf/test/test_h2o.py
+++ b/pyscf/mcscf/test/test_h2o.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 from pyscf import gto
 from pyscf import scf
@@ -200,7 +199,7 @@ def test_chkfile_mixed(self):
             * 4,
         )
         mo = mc.sort_mo([4, 5, 6, 10], base=1)
-        mc.chkfile = tempfile.NamedTemporaryFile().name
+        mc.chkfile = lib.NamedTemporaryFile().name
         mc.chk_ci = True
         mc.kernel(mo)
         self.assertAlmostEqual(mc.e_tot, mc_ref.e_tot, 8)
diff --git a/pyscf/mcscf/test/test_mc1step.py b/pyscf/mcscf/test/test_mc1step.py
index a89b8f08c5..9cb694ea23 100644
--- a/pyscf/mcscf/test/test_mc1step.py
+++ b/pyscf/mcscf/test/test_mc1step.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 import h5py
 from pyscf import lib
@@ -37,7 +36,7 @@ def setUpModule():
     )
     m = scf.RHF(mol)
     m.conv_tol = 1e-10
-    m.chkfile = tempfile.NamedTemporaryFile().name
+    m.chkfile = lib.NamedTemporaryFile().name
     m.scf()
     mc0 = mcscf.CASSCF(m, 4, 4).run()
 
@@ -51,8 +50,8 @@ def setUpModule():
     symmetry = True
     )
     msym = scf.RHF(molsym)
-    msym.chkfile = tempfile.NamedTemporaryFile().name
     msym.conv_tol = 1e-10
+    msym.chkfile = lib.NamedTemporaryFile().name
     msym.scf()
 
 def tearDownModule():
diff --git a/pyscf/mcscf/test/test_umc1step.py b/pyscf/mcscf/test/test_umc1step.py
index afbca92483..37d3cded85 100644
--- a/pyscf/mcscf/test/test_umc1step.py
+++ b/pyscf/mcscf/test/test_umc1step.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 from pyscf import lib
 from pyscf import gto
@@ -48,8 +47,8 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_ucasscf(self):
-        with tempfile.NamedTemporaryFile() as f:
-            mc = mcscf.UCASSCF(m, 4, 4)
+        mc = mcscf.UCASSCF(m, 4, 4)
+        with lib.NamedTemporaryFile() as f:
             mc.chkfile = f.name
             mc.run()
         self.assertAlmostEqual(mc.e_tot, -75.7460662487894, 6)
diff --git a/pyscf/mp/__init__.py b/pyscf/mp/__init__.py
index 33b1ee1522..023ca79f37 100644
--- a/pyscf/mp/__init__.py
+++ b/pyscf/mp/__init__.py
@@ -23,6 +23,7 @@
 from pyscf.mp import dfump2
 from pyscf.mp import gmp2
 from pyscf.mp import dfgmp2
+from pyscf.mp import cabs
 
 def MP2(mf, frozen=None, mo_coeff=None, mo_occ=None):
     if mf.istype('UHF'):
diff --git a/pyscf/mp/cabs.py b/pyscf/mp/cabs.py
new file mode 100644
index 0000000000..4122730444
--- /dev/null
+++ b/pyscf/mp/cabs.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+# Copyright 2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Complementary auxiliary basis set (CABS).
+
+Refs:
+* JCC 127, 221106 (2007); DOI:10.1063/1.2817618
+* JCP 128, 154103 (2008); DOI:10.1063/1.2889388
+"""
+
+import numpy
+import scipy.linalg
+
+from pyscf import gto, scf
+from pyscf.data import elements
+from pyscf.lib import logger
+from pyscf.scf import hf
+
+
+def find_cabs(mol, auxmol, lindep=1e-8):
+    """Project an auxiliary basis to the complement of the orbital basis."""
+    cabs_mol = gto.conc_mol(mol, auxmol)
+    nao = mol.nao_nr()
+    s = cabs_mol.intor_symmetric('int1e_ovlp')
+
+    ls12 = scipy.linalg.solve(s[:nao, :nao], s[:nao, nao:], assume_a='pos')
+    s[nao:, nao:] -= s[nao:, :nao].dot(ls12)
+    w, v = scipy.linalg.eigh(s[nao:, nao:])
+    c2 = v[:, w > lindep] / numpy.sqrt(w[w > lindep])
+    c1 = ls12.dot(c2)
+    return cabs_mol, numpy.vstack((-c1, c2))
+
+
+def make_cabs_auxmol(mol, auxbasis):
+    """Build a basis-only Mole object for the CABS basis.
+
+    The auxiliary functions must sit on the molecular centers, but they should
+    not add another copy of the nuclear attraction operator when the OBS and
+    CABS spaces are concatenated for one-electron matrix elements.
+    """
+    auxmol = mol.copy()
+    auxmol.basis = auxbasis
+    auxmol.build(False, False)
+    auxmol._atm[:, gto.CHARGE_OF] = 0
+    auxmol._ecpbas = auxmol._ecpbas[:0]
+    return auxmol
+
+
+def _as_cabs_auxmol(mol, auxmol_or_basis):
+    if isinstance(auxmol_or_basis, gto.MoleBase):
+        auxmol = auxmol_or_basis
+        if not auxmol._built:
+            auxmol.build(False, False)
+        if (
+            auxmol.natm == mol.natm
+            and numpy.linalg.norm(auxmol.atom_coords() - mol.atom_coords()) < 1e-10
+            and numpy.linalg.norm(auxmol.atom_charges()) > 1e-12
+        ):
+            return make_cabs_auxmol(mol, auxmol._basis)
+        return auxmol
+    return make_cabs_auxmol(mol, auxmol_or_basis)
+
+
+def _frozen_mask(mol, mo_occ, frozen):
+    mask = numpy.zeros(mo_occ.size, dtype=bool)
+    if frozen is None:
+        return mask
+
+    if isinstance(frozen, str):
+        scheme = frozen.lower()
+        if scheme == 'chemcore':
+            frozen = elements.chemcore(mol)
+        elif scheme == 'none':
+            frozen = 0
+        else:
+            raise ValueError(f'Unsupported CABS frozen orbital scheme {frozen!r}')
+
+    if isinstance(frozen, (bool, numpy.bool_)):
+        raise TypeError('CABS frozen orbitals must be specified as an int, sequence, tuple, or named scheme')
+    if isinstance(frozen, (int, numpy.integer)):
+        mask[:frozen] = True
+    else:
+        mask[numpy.asarray(frozen, dtype=int)] = True
+    return mask
+
+
+def _active_masks(mol, mo_occ, frozen):
+    frozen_mask = _frozen_mask(mol, mo_occ, frozen)
+    occidx = (mo_occ > 0) & ~frozen_mask
+    viridx = (mo_occ == 0) & ~frozen_mask
+    return occidx, viridx
+
+
+def _spin_masks(mol, spin_occ, frozen):
+    if isinstance(frozen, (tuple, list)) and len(frozen) == 2 and not isinstance(frozen[0], (int, numpy.integer)):
+        return tuple(_active_masks(mol, occ, frz) for occ, frz in zip(spin_occ, frozen))
+    return tuple(_active_masks(mol, occ, frozen) for occ in spin_occ)
+
+
+def _embed_dm(dm, nao, nca):
+    dm = numpy.asarray(dm)
+    dm_ext = numpy.zeros(dm.shape[:-2] + (nca, nca), dtype=dm.dtype)
+    dm_ext[..., :nao, :nao] = dm
+    return dm_ext
+
+
+def _get_jk(mf, cabs_mol, dm):
+    if getattr(mf, 'with_df', None) is not None:
+        dfmf = scf.RHF(cabs_mol).density_fit(auxbasis=mf.with_df.auxbasis)
+        dfmf.with_df.max_memory = mf.with_df.max_memory
+        dfmf.with_df.stdout = mf.with_df.stdout
+        dfmf.with_df.verbose = mf.with_df.verbose
+        return dfmf.get_jk(cabs_mol, dm, hermi=1)
+    return hf.get_jk(cabs_mol, dm, hermi=1)
+
+
+def _unrestricted_focks(mf, cabs_mol, dm):
+    vj, vk = _get_jk(mf, cabs_mol, dm)
+    hcore = mf.get_hcore(cabs_mol)
+    vj_tot = vj[0] + vj[1]
+    return hcore + vj_tot - vk[0], hcore + vj_tot - vk[1]
+
+
+def _extended_projector(mo_coeff, cabs_coeff):
+    nao, nmo = mo_coeff.shape
+    nca = cabs_coeff.shape[0]
+    pcoeff = numpy.zeros((nca, nmo + cabs_coeff.shape[1]))
+    pcoeff[:nao, :nmo] = mo_coeff
+    pcoeff[:, nmo:] = cabs_coeff
+    return pcoeff
+
+
+def _cabs_singles_from_fock(fock, pcoeff, mo_occ, mo_energy, occidx, viridx):
+    nmo = mo_occ.size
+    if not numpy.any(occidx):
+        return 0.0
+
+    # Diagonalize the external space formed by orbital-basis virtual MOs and CABS.
+    # The MO-virtual block is zero for canonical RHF/UHF, but gives the ROHF/non-canonical singles contribution,
+    # and MolPro separates those contributions.
+    extidx = numpy.r_[numpy.where(viridx)[0], numpy.arange(nmo, pcoeff.shape[1])]
+
+    fock_p = pcoeff.T.dot(fock).dot(pcoeff)
+    e_cabs, u_cabs = scipy.linalg.eigh(fock_p[numpy.ix_(extidx, extidx)])
+    fia = fock_p[numpy.ix_(occidx, extidx)].dot(u_cabs)
+    denom = mo_energy[occidx, None] - e_cabs
+    return numpy.einsum('i,ia,ia,ia->', mo_occ[occidx], fia, fia, 1.0 / denom)
+
+
+def energy_singles(mf, auxbasis, *, frozen='chemcore', lindep=1e-8):
+    r"""CABS singles correction to the Hartree-Fock reference energy.
+
+    For a closed-shell reference this evaluates
+
+    .. math::
+        E_\mathrm{CABS} = 2 \sum_{iA}
+            \frac{|F_{iA}|^2}{\epsilon_i - \epsilon_A}
+
+    where ``A`` denotes canonical orbitals in the external space formed by
+    the virtual MOs of the orbital basis and CABS. For UHF and ROHF references
+    the same expression is evaluated for the alpha and beta Fock matrices with
+    spin occupations as prefactors.
+
+    Args:
+        mf : SCF object
+            Converged molecular HF object.
+        auxbasis : Mole, str, list, tuple, or dict
+            CABS/OptRI basis as a Mole object or in the usual Mole.basis format.
+            If a normal charged Mole is supplied on the same centers as ``mf``,
+            only its basis is used; its nuclear charges and ECPs must be discarded.
+        frozen : None, int, sequence, tuple, or str
+            Frozen orbital selection. ``'chemcore'`` (default) freezes the chemical
+            core. ``None`` or ``0`` includes all orbitals. An integer freezes
+            the lowest orbitals and a sequence freezes explicit MO indices. For
+            UHF, a flat sequence is applied to both spins; a nested two-item
+            sequence gives separate alpha and beta frozen orbitals.
+        lindep : float
+            Linear-dependence threshold in the CABS projection.
+    """
+    mol = mf.mol
+    mo_coeff = mf.mo_coeff
+    mo_occ = mf.mo_occ
+    mo_energy = mf.mo_energy
+    is_uhf = isinstance(mo_coeff, (tuple, list)) or getattr(mo_coeff, 'ndim', 0) == 3
+    is_rohf = not is_uhf and numpy.any(mo_occ == 1)
+
+    if not is_uhf:
+        valid_occ = (mo_occ == 0) | (mo_occ == 2)
+        if is_rohf:
+            valid_occ |= mo_occ == 1
+        if numpy.any(~valid_occ):
+            raise NotImplementedError('CABS singles for general fractional-occupation references is not implemented.')
+
+    if is_rohf:
+        spin_coeff = (mo_coeff, mo_coeff)
+        spin_occ = (mo_occ > 0, mo_occ == 2)
+        spin_energy = (mo_energy.mo_ea, mo_energy.mo_eb)
+        spin_masks = _spin_masks(mol, spin_occ, frozen)
+    elif is_uhf:
+        spin_coeff = mo_coeff
+        spin_occ = mo_occ
+        spin_energy = mo_energy
+        spin_masks = _spin_masks(mol, spin_occ, frozen)
+    else:
+        occidx, viridx = _active_masks(mol, mo_occ, frozen)
+
+    auxmol = _as_cabs_auxmol(mol, auxbasis)
+    cabs_mol, cabs_coeff = find_cabs(mol, auxmol, lindep)
+    if cabs_coeff.shape[1] == 0:
+        logger.note(mf, 'CABS singles correction = 0.0')
+        return 0.0
+    nao = mol.nao_nr()
+    nca = cabs_mol.nao_nr()
+
+    if is_rohf or is_uhf:
+        dm = _embed_dm(mf.make_rdm1(), nao, nca)
+        focks = _unrestricted_focks(mf, cabs_mol, dm)
+        e_cabs = 0.0
+        for fock, coeff, occ, energy, (occidx, viridx) in zip(focks, spin_coeff, spin_occ, spin_energy, spin_masks):
+            pcoeff = _extended_projector(coeff, cabs_coeff)
+            e_cabs += _cabs_singles_from_fock(fock, pcoeff, occ, energy, occidx, viridx)
+    else:
+        pcoeff = _extended_projector(mo_coeff, cabs_coeff)
+        dm = _embed_dm(mf.make_rdm1(), nao, nca)
+        vj, vk = _get_jk(mf, cabs_mol, dm)
+        fock = mf.get_hcore(cabs_mol) + vj - vk * 0.5
+        e_cabs = _cabs_singles_from_fock(fock, pcoeff, mo_occ, mo_energy, occidx, viridx)
+
+    logger.note(mf, 'CABS singles correction = %.15g', e_cabs)
+    return e_cabs
+
+
+energy_cabs_singles = energy_singles
diff --git a/pyscf/mp/dfmp2.py b/pyscf/mp/dfmp2.py
index c81eed2bb9..fc4dca11a0 100644
--- a/pyscf/mp/dfmp2.py
+++ b/pyscf/mp/dfmp2.py
@@ -445,7 +445,7 @@ def _init_mp_df_eris_direct(with_df, occ_coeff, vir_coeff, max_memory, h5obj=Non
     # precompute for fitting
     j2c = fill_2c2e(mol, auxmol)
     try:
-        m2c = scipy.linalg.cholesky(j2c, lower=True)
+        m2c = np.asfortranarray(scipy.linalg.cholesky(j2c, lower=True))
         tag = 'cd'
     except scipy.linalg.LinAlgError:
         e, u = np.linalg.eigh(j2c)
diff --git a/pyscf/mp/dfump2.py b/pyscf/mp/dfump2.py
index 1835f68dd3..430922d9e1 100644
--- a/pyscf/mp/dfump2.py
+++ b/pyscf/mp/dfump2.py
@@ -468,7 +468,7 @@ def _init_mp_df_eris_direct(with_df, occ_coeff, vir_coeff, max_memory, h5obj=Non
     # precompute for fitting
     j2c = fill_2c2e(mol, auxmol)
     try:
-        m2c = scipy.linalg.cholesky(j2c, lower=True)
+        m2c = np.asfortranarray(scipy.linalg.cholesky(j2c, lower=True))
         tag = 'cd'
     except scipy.linalg.LinAlgError:
         e, u = np.linalg.eigh(j2c)
diff --git a/pyscf/mp/mp2f12_slow.py b/pyscf/mp/mp2f12_slow.py
index 83decc6c6c..4f7ab3f335 100644
--- a/pyscf/mp/mp2f12_slow.py
+++ b/pyscf/mp/mp2f12_slow.py
@@ -26,29 +26,18 @@
 import warnings
 from functools import reduce
 import numpy
-import scipy.linalg
 from pyscf import lib
 from pyscf.lib import logger
 from pyscf import gto
-from pyscf import ao2mo
-from pyscf.scf import jk
 from pyscf.mp import mp2
+from pyscf.mp import cabs
 
 warnings.warn('Module MP2-F12 is under testing')
 
 
 # The cabs space, the complimentary space to the OBS.
 def find_cabs(mol, auxmol, lindep=1e-8):
-    cabs_mol = gto.conc_mol(mol, auxmol)
-    nao = mol.nao_nr()
-    s = cabs_mol.intor_symmetric('int1e_ovlp')
-
-    ls12 = scipy.linalg.solve(s[:nao,:nao], s[:nao,nao:], assume_a='pos')
-    s[nao:,nao:] -= s[nao:,:nao].dot(ls12)
-    w, v = scipy.linalg.eigh(s[nao:,nao:])
-    c2 = v[:,w>lindep]/numpy.sqrt(w[w>lindep])
-    c1 = ls12.dot(c2)
-    return cabs_mol, numpy.vstack((-c1,c2))
+    return cabs.find_cabs(mol, auxmol, lindep)
 
 def trans(eri, mos):
     naoi, nmoi = mos[0].shape
diff --git a/pyscf/mp/test/test_cabs.py b/pyscf/mp/test/test_cabs.py
new file mode 100644
index 0000000000..db0050484d
--- /dev/null
+++ b/pyscf/mp/test/test_cabs.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# Copyright 2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from functools import reduce
+import os
+import numpy
+from pyscf import gto
+from pyscf import lo
+from pyscf import scf
+from pyscf.mp import cabs
+
+
+def setUpModule():
+    global mol, mf, mf1
+    mol = gto.Mole()
+    mol.verbose = 7
+    mol.output = '/dev/null'
+    mol.atom = [
+        [8 , (0. , 0.     , 0.)],
+        [1 , (0. , -0.757 , 0.587)],
+        [1 , (0. , 0.757  , 0.587)]]
+
+    mol.basis = {'H': 'cc-pvdz',
+                 'O': 'cc-pvdz',}
+    mol.build()
+    mf = scf.RHF(mol)
+    mf.conv_tol = 1e-12
+    mf.scf()
+
+def tearDownModule():
+    global mol, mf
+    mol.stdout.close()
+    del mol, mf
+
+
+class KnownValues(unittest.TestCase):
+    def test_find_cabs(self):
+        auxmol = mol.copy()
+        auxmol.basis = 'def2-tzvp'
+        auxmol.build(False, False)
+        cabs_mol, cabs_coeff = cabs.find_cabs(mol, auxmol)
+        nao = mol.nao_nr()
+        nca = cabs_coeff.shape[0]
+        c1 = numpy.zeros((nca,nao))
+        c1[:nao,:nao] = lo.orth.lowdin(mol.intor('int1e_ovlp_sph'))
+        c = numpy.hstack((c1,cabs_coeff))
+        s = reduce(numpy.dot, (c.T, cabs_mol.intor('int1e_ovlp_sph'), c))
+        self.assertAlmostEqual(numpy.linalg.norm(s-numpy.eye(c.shape[1])), 0, 8)
+
+    def test_rhf_cabs_singles(self):
+        mol = gto.Mole(atom='''
+            H   0.000000000   0.000000000   0.457870600
+            F   0.000000000   0.000000000  -0.457870600
+        ''', basis='cc-pvdz', verbose=0)
+        mol.build()
+        mf = scf.RHF(mol).density_fit(auxbasis='cc-pvdz-jkfit').run()
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit')
+        # MolPro: -0.033551214
+        self.assertAlmostEqual(e, -0.033551214448, 9)
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit', frozen=[0])
+        self.assertAlmostEqual(e, -0.033551214448, 9)
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit', frozen=0)
+        # MRCC:   -0.033749160781
+        self.assertAlmostEqual(e, -0.033749161358, 9)
+
+    def test_uhf_cabs_singles(self):
+        mol = gto.Mole(atom='''
+            H   0.000000000   0.000000000   0.457870600
+            O   0.000000000   0.000000000  -0.457870600
+        ''', basis='cc-pvdz', spin=1, verbose=0)
+        mol.build()
+        mf = scf.UHF(mol).density_fit(auxbasis='cc-pvdz-jkfit').run()
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit')
+        # no MolPro, it has only ROHF-MP2
+        self.assertAlmostEqual(e, -0.022958881659, 9)
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit', frozen=([0], [0]))
+        self.assertAlmostEqual(e, -0.022958881659, 9)
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit', frozen=0)
+        # MRCC:   -0.023166900284
+        self.assertAlmostEqual(e, -0.023166901553, 9)
+
+    def test_rohf_cabs_singles(self):
+        mol = gto.Mole(atom='''
+            H   0.000000000   0.000000000   0.457870600
+            O   0.000000000   0.000000000  -0.457870600
+        ''', basis='cc-pvdz', spin=1, verbose=0)
+        mol.build()
+        mf = scf.ROHF(mol).density_fit(auxbasis='cc-pvdz-jkfit').run()
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit')
+        # MolPro:
+        #                                       TOTAL          ALPHA          BETA
+        # Singles Contributions MO        -0.002681874   -0.001339551   -0.001342323
+        # Singles Contributions CABS      -0.022664571   -0.013326055   -0.009338516
+        # Pure DF-RHF relaxation          -0.022020485
+        #
+        # One has to sum MO + CABS contributions:
+        # -0.002681874 + -0.022664571 = -0.025346445
+        self.assertAlmostEqual(e, -0.025343529823, 9)
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit', frozen=([0], [0]))
+        self.assertAlmostEqual(e, -0.025343529823, 9)
+        e = cabs.energy_singles(mf, 'cc-pvdz-jkfit', frozen=0)
+        # MolPro:
+        #                                       TOTAL          ALPHA          BETA
+        # Singles Contributions MO        -0.002709569   -0.001352954   -0.001356615
+        # Singles Contributions CABS      -0.022873615   -0.013438077   -0.009435538
+        # Pure DF-RHF relaxation          -0.022198754
+        # Sum: -0.025583184
+        self.assertAlmostEqual(e, -0.025580122540, 9)
+
+
+if __name__ == "__main__":
+    print("Full Tests for CABS")
+    unittest.main()
diff --git a/pyscf/mp/test/test_dfmp2.py b/pyscf/mp/test/test_dfmp2.py
index 42a693739b..66bfdbdf61 100644
--- a/pyscf/mp/test/test_dfmp2.py
+++ b/pyscf/mp/test/test_dfmp2.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 from functools import reduce
 import numpy
 import numpy as np
@@ -125,7 +124,7 @@ def test_read_ovL_incore(self):
         self.assertAlmostEqual(mmp.e_corr, mmp1.e_corr, 8)
 
     def test_read_ovL_outcore(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
 
         mmp = mp.dfmp2.DFMP2(mf)
         eris = mmp.ao2mo(ovL_to_save=ftmp.name)
diff --git a/pyscf/mp/test/test_dfump2.py b/pyscf/mp/test/test_dfump2.py
index ada048fdf8..3741b2c2c8 100644
--- a/pyscf/mp/test/test_dfump2.py
+++ b/pyscf/mp/test/test_dfump2.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 from functools import reduce
 import numpy
 import numpy as np
@@ -106,7 +105,7 @@ def test_read_ovL_incore(self):
         self.assertAlmostEqual(mmp.e_corr, mmp1.e_corr, 8)
 
     def test_read_ovL_outcore(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
 
         mmp = mp.dfump2.DFUMP2(mf)
         eris = mmp.ao2mo(ovL_to_save=ftmp.name)
diff --git a/pyscf/mrpt/dfnevpt2.py b/pyscf/mrpt/dfnevpt2.py
index 4a11d64c4a..6c1b9a632c 100644
--- a/pyscf/mrpt/dfnevpt2.py
+++ b/pyscf/mrpt/dfnevpt2.py
@@ -16,7 +16,6 @@
 # Authors: Bhavnesh Jangid <jangidbhavnesh@uchicago.edu>
 
 import ctypes
-import tempfile
 import numpy as np
 from functools import reduce
 from pyscf import lib
@@ -128,7 +127,7 @@ def _dfnevpt2_eris_outcore(mc, mo_coeff, with_df):
 
     # Step-3: from the transfomed (L|pq), build pacv and cvcv
     tmpdir = lib.param.TMPDIR
-    cvcvfile = tempfile.NamedTemporaryFile(dir=tmpdir)
+    cvcvfile = lib.NamedTemporaryFile(dir=tmpdir)
     # Edge cases
     if ncore * nvir == 0 or ncore * nvir == 0:
         f5 = lib.H5TmpFile(cvcvfile.name, 'w')
diff --git a/pyscf/mrpt/nevpt2.py b/pyscf/mrpt/nevpt2.py
index 39a41d7e66..77d9abde53 100644
--- a/pyscf/mrpt/nevpt2.py
+++ b/pyscf/mrpt/nevpt2.py
@@ -19,7 +19,6 @@
 
 import ctypes
 
-import tempfile
 from functools import reduce
 import numpy
 import h5py
@@ -381,7 +380,7 @@ def Sijrs(mc, eris, verbose=None):
     ncas = mo_cas.shape[1]
     nocc = ncore + ncas
     if eris is None:
-        erifile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        erifile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         feri = ao2mo.outcore.general(mc.mol, (mo_core,mo_virt,mo_core,mo_virt),
                                      erifile.name, verbose=mc.verbose)
     else:
@@ -992,7 +991,7 @@ def trans_e1_outcore(mc, mo, max_memory=None, ioblk_size=256, tmpdir=None,
 
     if tmpdir is None:
         tmpdir = lib.param.TMPDIR
-    swapfile = tempfile.NamedTemporaryFile(dir=tmpdir)
+    swapfile = lib.NamedTemporaryFile(dir=tmpdir)
     ao2mo.outcore.half_e1(mol, (mo[:,:nocc],mo[:,ncore:]), swapfile.name,
                           max_memory=max_memory, ioblk_size=ioblk_size,
                           verbose=log, compact=False)
@@ -1016,7 +1015,7 @@ def load_buf(r0,r1):
     time0 = logger.timer(mol, 'halfe1', *time0)
     time1 = [logger.process_clock(), logger.perf_counter()]
     ao_loc = numpy.array(mol.ao_loc_nr(), dtype=numpy.int32)
-    cvcvfile = tempfile.NamedTemporaryFile(dir=tmpdir)
+    cvcvfile = lib.NamedTemporaryFile(dir=tmpdir)
     with lib.H5TmpFile(cvcvfile.name, 'w') as f5:
         cvcv = f5.create_dataset('eri_mo', (ncore*nvir,ncore*nvir), 'f8')
         ppaa, papa, pacv = _trans(mo, ncore, ncas, load_buf, cvcv, ao_loc)[:3]
diff --git a/pyscf/pbc/df/df.py b/pyscf/pbc/df/df.py
index d5be40139d..96aca10ca7 100644
--- a/pyscf/pbc/df/df.py
+++ b/pyscf/pbc/df/df.py
@@ -31,8 +31,8 @@
 
 import os
 import ctypes
+import sys
 import warnings
-import tempfile
 import contextlib
 import itertools
 import numpy
@@ -166,7 +166,7 @@ def __init__(self, cell, kpts=None):
         self.linear_dep_threshold = LINEAR_DEP_THR
         self._j_only = False
 # If _cderi_to_save is specified, the 3C-integral tensor will be saved in this file.
-        self._cderi_to_save = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        self._cderi_to_save = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
 # If _cderi is specified, the 3C-integral tensor will be read from this file
         self._cderi = None
         self._rsh_df = {}  # Range separated Coulomb DF objects
@@ -277,6 +277,10 @@ def build(self, j_only=None, with_j3c=True, kpts_band=None):
                 if self._cderi == cderi and os.path.isfile(cderi):
                     logger.warn(self, 'File %s (specified by ._cderi) is '
                                 'overwritten by GDF initialization.', cderi)
+                    # On Windows, close the handle before os.remove to avoid
+                    # permission error.
+                    if sys.platform == 'win32':
+                        self._cderi_to_save.close()
                     os.remove(cderi)
                 else:
                     logger.warn(self, 'Value of ._cderi is ignored. '
diff --git a/pyscf/pbc/df/gdf_builder.py b/pyscf/pbc/df/gdf_builder.py
index 3f0a20b0ce..880d58b8c0 100644
--- a/pyscf/pbc/df/gdf_builder.py
+++ b/pyscf/pbc/df/gdf_builder.py
@@ -26,7 +26,6 @@
 
 import os
 import ctypes
-import tempfile
 import numpy as np
 import scipy.linalg
 from pyscf import gto
@@ -204,7 +203,7 @@ def outcore_auxe2(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
             shls_slice :
                 Indicate the shell slices in the primitive cell
         '''
-        swapfile = tempfile.NamedTemporaryFile(dir=os.path.dirname(cderi_file))
+        swapfile = lib.NamedTemporaryFile(dir=os.path.dirname(cderi_file))
         fswap = lib.H5TmpFile(swapfile.name)
         swapfile = None
 
@@ -213,6 +212,7 @@ def outcore_auxe2(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
         rs_cell = self.rs_cell
         fused_cell = self.fused_cell
         naux = self.auxcell.nao
+        nauxc = self.fused_cell.nao
         kpts = self.kpts
         nkpts = kpts.shape[0]
 
@@ -243,7 +243,6 @@ def outcore_auxe2(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
         else:
             merge_dd = None
 
-        reindex_k = None
         # TODO: shape = (comp, nao_pair, naux)
         shape = (nao_pair, naux)
         if j_only or nkpts == 1:
@@ -254,14 +253,12 @@ def outcore_auxe2(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
                 # Ensure kk_idx is a subset of all possible ki-kj paris
                 assert np.all(np.isin(kk_idx, kikj_idx))
                 kikj_idx = kk_idx
-            reindex_k = kikj_idx // nkpts
         else:
             nkpts_ij = nkpts * nkpts
             if kk_idx is None:
                 kikj_idx = np.arange(nkpts_ij, dtype=np.int32)
             else:
                 kikj_idx = kk_idx
-            reindex_k = kikj_idx
             if merge_dd and kk_idx is None:
                 kpt_ij_iters = list(kk_adapted_iter(cell, kpts))
 
@@ -280,80 +277,169 @@ def outcore_auxe2(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
             self._outcore_dd_block(fswap, intor, aosym, comp, j_only,
                                    dataname, kk_idx=kk_idx)
 
-        # int3c2e for (cell, cell | fused_cell)
-        int3c = self.gen_int3c_kernel(intor, aosym, comp, j_only,
-                                      reindex_k=reindex_k, auxcell=self.fused_cell)
-
         mem_now = lib.current_memory()[0]
         log.debug2('memory = %s', mem_now)
         max_memory = max(2000, self.max_memory-mem_now)
 
         # split the 3-center tensor (nkpts_ij, i, j, aux) along shell i.
         # plus 1 to ensure the intermediates in libpbc do not overflow
-        buflen = min(max(int(max_memory*.9e6/16/naux/(nkpts_ij+1)), 1), nao_pair)
+        buflen = min(max(int(max_memory*.9e6/16/nauxc/(nkpts_ij+1)), 1), nao_pair)
         # lower triangle part
         sh_ranges = _guess_shell_ranges(cell, buflen, aosym, start=ish0, stop=ish1)
         max_buflen = max([x[2] for x in sh_ranges])
-        if max_buflen > buflen:
+
+        # The per-step int3c output buffer is (nkpts_ij_chunk, max_buflen,
+        # nauxc) doubles for both R and I, i.e. nkpts_ij_chunk*max_buflen*nauxc*16
+        # bytes. When nkpts_ij is large (e.g. 6x6x6 k-mesh with j_only=False
+        # gives 46656 pairs), this buffer dwarfs max_memory and the libpbc
+        # int3c kernel OOMs. Split kikj_idx into chunks so each chunk's int3c
+        # call fits within max_memory, then loop over chunks. The shell-block
+        # granularity (sh_ranges) is held fixed across chunks so that fswap
+        # row writes (row0:row1) remain consistent.
+        bytes_per_pair = max_buflen * nauxc * 16
+        kpts_chunk = max(1, int(max_memory * .9e6 / bytes_per_pair))
+        kpts_chunk = min(kpts_chunk, nkpts_ij)
+
+        # Build chunks of kpt-pair indices. The merge_dd branch that pairs
+        # (ij_idx, ji_idx) within an adapted-kpt group (see kpt_ij_iters
+        # below) requires both indices to live in the same chunk; for that
+        # path we group whole kk_adapted groups together. Otherwise we chunk
+        # contiguously.
+        need_group_chunks = (merge_dd is not None
+                             and not (j_only or nkpts == 1)
+                             and not gamma_point_only
+                             and kk_idx is None)
+        if need_group_chunks:
+            chunks_meta = []
+            cur_pairs = []
+            cur_groups = []
+            for grp in kpt_ij_iters:
+                _, ki_idx_g, kj_idx_g, self_conj_g = grp
+                grp_pairs = list(ki_idx_g * nkpts + kj_idx_g)
+                if not self_conj_g:
+                    grp_pairs += list(kj_idx_g * nkpts + ki_idx_g)
+                if cur_pairs and len(cur_pairs) + len(grp_pairs) > kpts_chunk:
+                    chunks_meta.append((np.asarray(cur_pairs, dtype=np.int32),
+                                        cur_groups))
+                    cur_pairs = []
+                    cur_groups = []
+                cur_pairs.extend(grp_pairs)
+                cur_groups.append(grp)
+            if cur_pairs:
+                chunks_meta.append((np.asarray(cur_pairs, dtype=np.int32),
+                                    cur_groups))
+        else:
+            chunks_meta = []
+            for start in range(0, nkpts_ij, kpts_chunk):
+                end = min(start + kpts_chunk, nkpts_ij)
+                chunks_meta.append((np.asarray(kikj_idx[start:end],
+                                               dtype=np.int32), None))
+        nchunks = len(chunks_meta)
+
+        if max_buflen > buflen and nchunks == 1:
+            # Only meaningful when chunking did not bring usage under budget.
             log.warn('memory usage of outcore_auxe2 may be %.2f times over max_memory',
                      (max_buflen/buflen - 1))
+        if nchunks > 1:
+            log.debug('outcore_auxe2: splitting %d kpt pairs into %d chunks '
+                      '(<= %d pairs/chunk, max_buflen=%d, nauxc=%d)',
+                      nkpts_ij, nchunks, kpts_chunk, max_buflen, nauxc)
 
         cpu0 = logger.process_clock(), logger.perf_counter()
         nsteps = len(sh_ranges)
-        row1 = 0
-        for istep, (sh_start, sh_end, nrow) in enumerate(sh_ranges):
-            if aosym == 's2':
-                shls_slice = (sh_start, sh_end, jsh0, sh_end, ksh0, ksh1)
+
+        for ichunk, (kikj_idx_chunk, chunk_groups) in enumerate(chunks_meta):
+            # Build a fresh int3c kernel restricted to this chunk's kpt-pairs.
+            # The chunk's reindex_k follows the same convention as the original
+            # reindex_k assignment above.
+            if j_only or nkpts == 1:
+                reindex_k_chunk = kikj_idx_chunk // nkpts
             else:
-                shls_slice = (sh_start, sh_end, jsh0, jsh1, ksh0, ksh1)
-            outR, outI = int3c(shls_slice)
-            log.debug2('      step [%d/%d], shell range [%d:%d], len(buf) = %d',
-                       istep+1, nsteps, sh_start, sh_end, nrow)
-            cpu0 = log.timer_debug1(f'outcore_auxe2 [{istep+1}/{nsteps}]', *cpu0)
-
-            outR = list(outR)
-            if outI is not None:
-                outI = list(outI)
-            for k, idx in enumerate(kikj_idx):
-                outR[k] = self.fuse(outR[k], axis=1)
-                if f'{dataname}I/{idx}' in fswap and outI[k] is not None:
-                    outI[k] = self.fuse(outI[k], axis=1)
-
-            shls_slice = (sh_start, sh_end, 0, cell.nbas)
-            row0, row1 = row1, row1 + nrow
-            if merge_dd is not None:
-                if gamma_point_only:
-                    merge_dd(outR[0], fswap[f'{dataname}R-dd/0'], shls_slice)
-                elif j_only or nkpts == 1:
-                    for k, idx in enumerate(kikj_idx):
-                        merge_dd(outR[k], fswap[f'{dataname}R-dd/{idx}'], shls_slice)
-                        merge_dd(outI[k], fswap[f'{dataname}I-dd/{idx}'], shls_slice)
-                elif kk_idx is None:
-                    for _, ki_idx, kj_idx, self_conj in kpt_ij_iters:
-                        kpt_ij_idx = ki_idx * nkpts + kj_idx
-                        if self_conj:
-                            for ij_idx in kpt_ij_idx:
-                                merge_dd(outR[ij_idx], fswap[f'{dataname}R-dd/{ij_idx}'], shls_slice)
-                                merge_dd(outI[ij_idx], fswap[f'{dataname}I-dd/{ij_idx}'], shls_slice)
-                        else:
-                            kpt_ji_idx = kj_idx * nkpts + ki_idx
-                            for ij_idx, ji_idx in zip(kpt_ij_idx, kpt_ji_idx):
-                                j3cR_dd = np.asarray(fswap[f'{dataname}R-dd/{ij_idx}'])
-                                merge_dd(outR[ij_idx], j3cR_dd, shls_slice)
-                                merge_dd(outR[ji_idx], j3cR_dd.transpose(1,0,2), shls_slice)
-                                j3cI_dd = np.asarray(fswap[f'{dataname}I-dd/{ij_idx}'])
-                                merge_dd(outI[ij_idx], j3cI_dd, shls_slice)
-                                merge_dd(outI[ji_idx],-j3cI_dd.transpose(1,0,2), shls_slice)
+                reindex_k_chunk = kikj_idx_chunk
+            chunk_int3c = self.gen_int3c_kernel(
+                intor, aosym, comp, j_only,
+                reindex_k=reindex_k_chunk, auxcell=self.fused_cell)
+            # Local map from global kpt-pair index -> position in outR/outI
+            pair_pos = {int(idx): k for k, idx in enumerate(kikj_idx_chunk)}
+
+            row1 = 0
+            for istep, (sh_start, sh_end, nrow) in enumerate(sh_ranges):
+                if aosym == 's2':
+                    int3c_shls_slice = (sh_start, sh_end, jsh0, sh_end, ksh0, ksh1)
                 else:
-                    for k, idx in enumerate(kikj_idx):
-                        merge_dd(outR[k], fswap[f'{dataname}R-dd/{idx}'], shls_slice)
-                        merge_dd(outI[k], fswap[f'{dataname}I-dd/{idx}'], shls_slice)
-
-            for k, idx in enumerate(kikj_idx):
-                fswap[f'{dataname}R/{idx}'][row0:row1] = outR[k]
-                if f'{dataname}I/{idx}' in fswap:
-                    fswap[f'{dataname}I/{idx}'][row0:row1] = outI[k]
-            outR = outI = None
+                    int3c_shls_slice = (sh_start, sh_end, jsh0, jsh1, ksh0, ksh1)
+                outR, outI = chunk_int3c(int3c_shls_slice)
+                log.debug2('      chunk [%d/%d] step [%d/%d], shell range [%d:%d], '
+                           'len(buf) = %d',
+                           ichunk+1, nchunks, istep+1, nsteps, sh_start, sh_end, nrow)
+                cpu0 = log.timer_debug1(
+                    f'outcore_auxe2 chunk[{ichunk+1}/{nchunks}] '
+                    f'step[{istep+1}/{nsteps}]', *cpu0)
+
+                outR = list(outR)
+                if outI is not None:
+                    outI = list(outI)
+                for k, idx in enumerate(kikj_idx_chunk):
+                    outR[k] = self.fuse(outR[k], axis=1)
+                    if f'{dataname}I/{idx}' in fswap and outI[k] is not None:
+                        outI[k] = self.fuse(outI[k], axis=1)
+
+                merge_shls_slice = (sh_start, sh_end, 0, cell.nbas)
+                row0, row1 = row1, row1 + nrow
+                if merge_dd is not None:
+                    if gamma_point_only:
+                        merge_dd(outR[0], fswap[f'{dataname}R-dd/0'],
+                                 merge_shls_slice)
+                    elif j_only or nkpts == 1:
+                        for k, idx in enumerate(kikj_idx_chunk):
+                            merge_dd(outR[k], fswap[f'{dataname}R-dd/{idx}'],
+                                     merge_shls_slice)
+                            merge_dd(outI[k], fswap[f'{dataname}I-dd/{idx}'],
+                                     merge_shls_slice)
+                    elif kk_idx is None:
+                        for _, ki_idx, kj_idx, self_conj in chunk_groups:
+                            kpt_ij_idx = ki_idx * nkpts + kj_idx
+                            if self_conj:
+                                for ij_idx in kpt_ij_idx:
+                                    ij_local = pair_pos[int(ij_idx)]
+                                    merge_dd(outR[ij_local],
+                                             fswap[f'{dataname}R-dd/{ij_idx}'],
+                                             merge_shls_slice)
+                                    merge_dd(outI[ij_local],
+                                             fswap[f'{dataname}I-dd/{ij_idx}'],
+                                             merge_shls_slice)
+                            else:
+                                kpt_ji_idx = kj_idx * nkpts + ki_idx
+                                for ij_idx, ji_idx in zip(kpt_ij_idx, kpt_ji_idx):
+                                    ij_local = pair_pos[int(ij_idx)]
+                                    ji_local = pair_pos[int(ji_idx)]
+                                    j3cR_dd = np.asarray(
+                                        fswap[f'{dataname}R-dd/{ij_idx}'])
+                                    merge_dd(outR[ij_local], j3cR_dd,
+                                             merge_shls_slice)
+                                    merge_dd(outR[ji_local],
+                                             j3cR_dd.transpose(1,0,2),
+                                             merge_shls_slice)
+                                    j3cI_dd = np.asarray(
+                                        fswap[f'{dataname}I-dd/{ij_idx}'])
+                                    merge_dd(outI[ij_local], j3cI_dd,
+                                             merge_shls_slice)
+                                    merge_dd(outI[ji_local],
+                                            -j3cI_dd.transpose(1,0,2),
+                                             merge_shls_slice)
+                    else:
+                        for k, idx in enumerate(kikj_idx_chunk):
+                            merge_dd(outR[k], fswap[f'{dataname}R-dd/{idx}'],
+                                     merge_shls_slice)
+                            merge_dd(outI[k], fswap[f'{dataname}I-dd/{idx}'],
+                                     merge_shls_slice)
+
+                for k, idx in enumerate(kikj_idx_chunk):
+                    fswap[f'{dataname}R/{idx}'][row0:row1] = outR[k]
+                    if f'{dataname}I/{idx}' in fswap:
+                        fswap[f'{dataname}I/{idx}'][row0:row1] = outI[k]
+                outR = outI = None
+            chunk_int3c = None
         return fswap
 
     def weighted_ft_ao(self, kpt):
diff --git a/pyscf/pbc/df/mdf.py b/pyscf/pbc/df/mdf.py
index e8f522130a..00e869660b 100644
--- a/pyscf/pbc/df/mdf.py
+++ b/pyscf/pbc/df/mdf.py
@@ -22,7 +22,6 @@
 J. Chem. Phys. 147, 164119 (2017)
 '''
 
-import tempfile
 import numpy as np
 import h5py
 import scipy.linalg
@@ -89,7 +88,7 @@ def __init__(self, cell, kpts=np.zeros((1,3))):
         self.linear_dep_threshold = df.LINEAR_DEP_THR
         self._j_only = False
 # If _cderi_to_save is specified, the 3C-integral tensor will be saved in this file.
-        self._cderi_to_save = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        self._cderi_to_save = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
 # If _cderi is specified, the 3C-integral tensor will be read from this file
         self._cderi = None
         self._rsh_df = {}  # Range separated Coulomb DF objects
diff --git a/pyscf/pbc/df/rsdf.py b/pyscf/pbc/df/rsdf.py
index a9e7b13100..4f7142cad4 100644
--- a/pyscf/pbc/df/rsdf.py
+++ b/pyscf/pbc/df/rsdf.py
@@ -40,9 +40,9 @@
 '''
 
 import os
+import sys
 import h5py
 import scipy.linalg
-import tempfile
 import numpy as np
 
 from pyscf import lib
@@ -424,10 +424,11 @@ def outcore_auxe2(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
                       kptij_lst=None, j_only=False, dataname='j3c-junk',
                       shls_slice=None):
         # Deadlock on NFS if you open an already-opened tmpfile in H5PY
-        # swapfile = tempfile.NamedTemporaryFile(dir=os.path.dirname(cderi_file))
+        # swapfile = lib.NamedTemporaryFile(dir=os.path.dirname(cderi_file))
         fswap = lib.H5TmpFile(dir=os.path.dirname(cderi_file), prefix='.outcore_auxe2_swap')
         # avoid trash files
-        os.unlink(fswap.filename)
+        if sys.platform != 'win32':
+            os.unlink(fswap.filename)
 
         cell = self.cell
         if self.use_bvk and self.kpts_band is None:
diff --git a/pyscf/pbc/df/rsdf_builder.py b/pyscf/pbc/df/rsdf_builder.py
index 846db5cd83..b222140b88 100644
--- a/pyscf/pbc/df/rsdf_builder.py
+++ b/pyscf/pbc/df/rsdf_builder.py
@@ -28,6 +28,7 @@
 '''
 
 import os
+import sys
 import ctypes
 import warnings
 import tempfile
@@ -384,7 +385,8 @@ def outcore_auxe2(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
         # as cderi_file.
         fswap = lib.H5TmpFile(dir=os.path.dirname(cderi_file), prefix='.outcore_auxe2_swap')
         # Unlink swapfile to avoid trash files
-        os.unlink(fswap.filename)
+        if sys.platform != 'win32':
+            os.unlink(fswap.filename)
 
         log = logger.new_logger(self)
         cell = self.cell
diff --git a/pyscf/pbc/df/test/test_gdf_builder.py b/pyscf/pbc/df/test/test_gdf_builder.py
index 5f49ef477e..aeddf16c96 100644
--- a/pyscf/pbc/df/test/test_gdf_builder.py
+++ b/pyscf/pbc/df/test/test_gdf_builder.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy as np
 import scipy.linalg
 from pyscf import lib
@@ -108,7 +107,7 @@ def test_get_2c2e_cart(self):
 
     def test_make_j3c_gamma(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell, auxcell).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 1.5094843470069796, 7)
@@ -124,7 +123,7 @@ def test_make_j3c_gamma(self):
 
     def test_make_j3c(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell, auxcell, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -148,7 +147,7 @@ def test_make_j3c(self):
 
     def test_make_j3c_j_only(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell, auxcell, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -170,7 +169,7 @@ def test_make_j3c_gamma_2d(self):
                       dimension=2)
         auxcell = df.make_auxcell(cell, auxbasis)
         dfbuilder = gdf_builder._CCGDFBuilder(cell, auxcell).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2.T.dot(v2)), 0.3289627476345819, 8)
@@ -183,7 +182,7 @@ def test_make_j3c_gamma_1d(self):
                       dimension=1)
         auxcell = df.make_auxcell(cell, auxbasis)
         dfbuilder = gdf_builder._CCGDFBuilder(cell, auxcell).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 1.7171975296579753, 6)
@@ -197,7 +196,7 @@ def test_make_j3c_gamma_0d(self):
         auxcell = df.make_auxcell(cell, auxbasis)
         dfbuilder = gdf_builder._CCGDFBuilder(cell, auxcell).build()
         ref = cholesky_eri(cell, auxmol=auxcell)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(v2 - ref).max(), 0, 9)
@@ -278,7 +277,7 @@ def test_vs_fft(self):
         j3c = lib.dot(auxG.conj()*wcoulG, aopair.reshape(ngrids,-1))
         j2c = scipy.linalg.cholesky(j2c[0], lower=True)
         ref = scipy.linalg.solve_triangular(j2c, j3c, lower=True)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 7)
@@ -304,7 +303,7 @@ def test_get_2c2e_cart_lr(self):
 
     def test_make_j3c_gamma_lr(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell_lr, auxcell_lr).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 1.0942903795950072, 7)
@@ -320,7 +319,7 @@ def test_make_j3c_gamma_lr(self):
 
     def test_make_j3c_lr(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell_lr, auxcell_lr, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -343,7 +342,7 @@ def test_make_j3c_lr(self):
 
     def test_make_j3c_j_only_lr(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell_lr, auxcell_lr, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -392,7 +391,7 @@ def test_vs_fft_lr(self):
         j3c = lib.dot(auxG.conj()*wcoulG, aopair.reshape(ngrids,-1))
         j2c = scipy.linalg.cholesky(j2c[0], lower=True)
         ref = scipy.linalg.solve_triangular(j2c, j3c, lower=True)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 7)
@@ -418,7 +417,7 @@ def test_get_2c2e_cart_sr(self):
 
     def test_make_j3c_gamma_sr(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell_sr, auxcell_sr).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 0.9647178630555139, 7)
@@ -434,7 +433,7 @@ def test_make_j3c_gamma_sr(self):
 
     def test_make_j3c_sr(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell_sr, auxcell_sr, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -458,7 +457,7 @@ def test_make_j3c_sr(self):
 
     def test_make_j3c_j_only_sr(self):
         dfbuilder = gdf_builder._CCGDFBuilder(cell_sr, auxcell_sr, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -506,7 +505,7 @@ def test_vs_fft_sr(self):
         j3c = lib.dot(auxG.conj()*wcoulG, aopair.reshape(ngrids,-1))
         j2c = scipy.linalg.cholesky(j2c[0], lower=True)
         ref = scipy.linalg.solve_triangular(j2c, j3c, lower=True)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 7)
diff --git a/pyscf/pbc/df/test/test_mdf_builder.py b/pyscf/pbc/df/test/test_mdf_builder.py
index ca519c587b..f1de15a61d 100644
--- a/pyscf/pbc/df/test/test_mdf_builder.py
+++ b/pyscf/pbc/df/test/test_mdf_builder.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy as np
 import scipy.linalg
 from pyscf import lib
@@ -109,7 +108,7 @@ def test_ccmdf_get_2c2e_cart(self):
 
     def test_ccmdf_make_j3c_gamma(self):
         dfbuilder = mdf._CCMDFBuilder(cell, auxcell).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 0.01486794482668373, 7)
@@ -127,7 +126,7 @@ def test_ccmdf_make_j3c_gamma(self):
 
     def test_ccmdf_make_j3c(self):
         dfbuilder = mdf._CCMDFBuilder(cell, auxcell, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -149,7 +148,7 @@ def test_ccmdf_make_j3c(self):
 
     def test_ccmdf_make_j3c_j_only(self):
         dfbuilder = mdf._CCMDFBuilder(cell, auxcell, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -200,7 +199,7 @@ def test_ccmdf_vs_fft(self):
         j2c = dfbuilder.eigenvalue_decomposed_metric(j2c[0])
         ref = lib.dot(j2c[0], j3c)
         ref = ref.T.dot(ref)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 8)
@@ -224,7 +223,7 @@ def test_rsmdf_get_2c2e_cart(self):
 
     def test_rsmdf_make_j3c_gamma(self):
         dfbuilder = mdf._RSMDFBuilder(cell, auxcell).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 0.01486794482668373, 7)
@@ -257,7 +256,7 @@ def test_rsmdf_make_j3c_gamma(self):
 
     def test_rsmdf_make_j3c(self):
         dfbuilder = mdf._RSMDFBuilder(cell, auxcell, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -279,7 +278,7 @@ def test_rsmdf_make_j3c(self):
 
     def test_rsmdf_make_j3c_j_only(self):
         dfbuilder = mdf._RSMDFBuilder(cell, auxcell, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -316,7 +315,7 @@ def test_ccmdf_get_2c2e_cart_lr(self):
 
     def test_ccmdf_make_j3c_gamma_lr(self):
         dfbuilder = mdf._CCMDFBuilder(cell_lr, auxcell_lr).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 1.0439710349332878e-05, 7)
@@ -334,7 +333,7 @@ def test_ccmdf_make_j3c_gamma_lr(self):
 
     def test_ccmdf_make_j3c_lr(self):
         dfbuilder = mdf._CCMDFBuilder(cell_lr, auxcell_lr, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -356,7 +355,7 @@ def test_ccmdf_make_j3c_lr(self):
 
     def test_ccmdf_make_j3c_j_only_lr(self):
         dfbuilder = mdf._CCMDFBuilder(cell_lr, auxcell_lr, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -411,7 +410,7 @@ def test_ccmdf_vs_fft_lr(self):
         j2c = dfbuilder.eigenvalue_decomposed_metric(j2c[0])
         ref = lib.dot(j2c[0], j3c)
         ref = ref.T.dot(ref)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 9)
@@ -437,7 +436,7 @@ def test_ccmdf_get_2c2e_cart_sr(self):
 
     def test_ccmdf_make_j3c_gamma_sr(self):
         dfbuilder = mdf._CCMDFBuilder(cell_sr, auxcell_sr).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 0.014857466177913803, 7)
@@ -455,7 +454,7 @@ def test_ccmdf_make_j3c_gamma_sr(self):
 
     def test_ccmdf_make_j3c_sr(self):
         dfbuilder = mdf._CCMDFBuilder(cell_sr, auxcell_sr, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -477,7 +476,7 @@ def test_ccmdf_make_j3c_sr(self):
 
     def test_ccmdf_make_j3c_j_only_sr(self):
         dfbuilder = mdf._CCMDFBuilder(cell_sr, auxcell_sr, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -531,7 +530,7 @@ def test_ccmdf_vs_fft_sr(self):
         j2c = dfbuilder.eigenvalue_decomposed_metric(j2c[0])
         ref = lib.dot(j2c[0], j3c)
         ref = ref.T.dot(ref)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 8)
@@ -557,7 +556,7 @@ def test_rsmdf_get_2c2e_cart_sr(self):
 
     def test_rsmdf_make_j3c_gamma_sr(self):
         dfbuilder = mdf._RSMDFBuilder(cell_sr, auxcell_sr).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 0.014857466177913803, 7)
@@ -590,7 +589,7 @@ def test_rsmdf_make_j3c_gamma_sr(self):
 
     def test_rsmdf_make_j3c_sr(self):
         dfbuilder = mdf._RSMDFBuilder(cell_sr, auxcell_sr, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -612,7 +611,7 @@ def test_rsmdf_make_j3c_sr(self):
 
     def test_rsmdf_make_j3c_j_only_sr(self):
         dfbuilder = mdf._RSMDFBuilder(cell_sr, auxcell_sr, kpts).set(mesh=mesh).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
diff --git a/pyscf/pbc/df/test/test_outcore.py b/pyscf/pbc/df/test/test_outcore.py
index 4541db1327..d2fcf254e0 100644
--- a/pyscf/pbc/df/test/test_outcore.py
+++ b/pyscf/pbc/df/test/test_outcore.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 import h5py
 from pyscf import lib
@@ -45,7 +44,7 @@ def test_aux_e1(self):
         numpy.random.seed(1)
         kptij_lst = numpy.random.random((3,2,3))
         kptij_lst[0] = 0
-        with tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR) as tmpfile:
+        with lib.NamedTemporaryFile(dir=lib.param.TMPDIR) as tmpfile:
             outcore.aux_e1(cell, cell, tmpfile.name, aosym='s2', comp=1,
                            kptij_lst=kptij_lst, verbose=0)
             refk = incore.aux_e2(cell, cell, aosym='s1', kptij_lst=kptij_lst)
diff --git a/pyscf/pbc/df/test/test_rsdf_1.py b/pyscf/pbc/df/test/test_rsdf_1.py
index 9f29f9b9e1..01d7f37884 100644
--- a/pyscf/pbc/df/test/test_rsdf_1.py
+++ b/pyscf/pbc/df/test/test_rsdf_1.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy as np
 import scipy.linalg
 from pyscf import lib
@@ -88,7 +87,7 @@ def test_make_j3c_gamma(self):
         dfbuilder = rsdf._RSGDFBuilder(cell, auxcell, kpts)
         dfbuilder.__dict__.update(dfobj.__dict__)
         dfbuilder.build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 1.4877735852543206, 8)
@@ -97,7 +96,7 @@ def test_make_j3c(self):
         dfbuilder = rsdf._RSGDFBuilder(cell, auxcell, kpts)
         dfbuilder.__dict__.update(dfobj.__dict__)
         dfbuilder.build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             self.assertAlmostEqual(lib.fp(load(tmpf.name, kpts[[0, 0]])), 1.4877735860707935, 7)
             self.assertAlmostEqual(lib.fp(load(tmpf.name, kpts[[2, 4]])), 4.530919637533813+0.10852447737595214j, 7)
@@ -107,7 +106,7 @@ def test_make_j3c_j_only(self):
         dfbuilder = rsdf._RSGDFBuilder(cell, auxcell, kpts)
         dfbuilder.__dict__.update(dfobj.__dict__)
         dfbuilder.build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             self.assertAlmostEqual(lib.fp(load(tmpf.name, kpts[[0, 0]])), 1.4877735860707935, 7)
             self.assertAlmostEqual(lib.fp(load(tmpf.name, kpts[[2, 2]])), 1.4492567814298059, 7)
@@ -147,7 +146,7 @@ def test_vs_fft(self):
         j3c = lib.dot(auxG.conj()*coulG, aopair.reshape(ngrids,-1))
         j2c = scipy.linalg.cholesky(j2c[0], lower=True)
         ref = scipy.linalg.solve_triangular(j2c, j3c, lower=True)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 9)
diff --git a/pyscf/pbc/df/test/test_rsdf_builder.py b/pyscf/pbc/df/test/test_rsdf_builder.py
index 0edec5ebd4..61abe2956b 100644
--- a/pyscf/pbc/df/test/test_rsdf_builder.py
+++ b/pyscf/pbc/df/test/test_rsdf_builder.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy as np
 import scipy.linalg
 from pyscf import lib
@@ -116,7 +115,7 @@ def test_get_2c2e_cart(self):
 
     def test_make_j3c_gamma(self):
         dfbuilder = rsdf_builder._RSGDFBuilder(cell, auxcell).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 1.5094843470069796, 7)
@@ -145,7 +144,7 @@ def test_make_j3c_gamma(self):
 
     def test_make_j3c(self):
         dfbuilder = rsdf_builder._RSGDFBuilder(cell, auxcell, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -168,7 +167,7 @@ def test_make_j3c(self):
 
     def test_make_j3c_j_only(self):
         dfbuilder = rsdf_builder._RSGDFBuilder(cell, auxcell, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -189,13 +188,13 @@ def test_make_j3c_kptij_lst(self):
         kj_idx = np.array([15, 18, 21, 1, 2 , 4, 5])
         kij_idx = np.array([ki_idx,kj_idx]).T
         kptij_lst = kpts[kij_idx]
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             cderi = tmpf.name
             dfbuilder.make_j3c(cderi, aosym='s1')
             with df.CDERIArray(cderi) as cderi_array:
                 ref = np.array([cderi_array[ki, kj] for ki, kj in kij_idx])
 
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             cderi = tmpf.name
             dfbuilder.make_j3c(cderi, aosym='s1', kptij_lst=kptij_lst)
             with df.CDERIArray(cderi) as cderi_array:
@@ -210,7 +209,7 @@ def test_make_j3c_gamma_2d(self):
                       dimension=2)
         auxcell = df.make_auxcell(cell, auxbasis)
         dfbuilder = rsdf_builder._RSGDFBuilder(cell, auxcell).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2.T.dot(v2)), 0.3289627476345819, 7)
@@ -223,7 +222,7 @@ def test_make_j3c_gamma_1d(self):
                       dimension=1)
         auxcell = df.make_auxcell(cell, auxbasis)
         dfbuilder = rsdf_builder._RSGDFBuilder(cell, auxcell).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 1.7171973261620863, 5)
@@ -236,7 +235,7 @@ def test_make_j3c_gamma_0d(self):
                       dimension=0)
         auxcell = df.make_auxcell(cell, auxbasis)
         dfbuilder = rsdf_builder._RSGDFBuilder(cell, auxcell).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
         ref = cholesky_eri(cell, auxmol=auxcell)
@@ -314,7 +313,7 @@ def test_vs_fft(self):
         j3c = lib.dot(auxG.conj()*wcoulG, aopair.reshape(ngrids,-1))
         j2c = scipy.linalg.cholesky(j2c[0], lower=True)
         ref = scipy.linalg.solve_triangular(j2c, j3c, lower=True)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 7)
@@ -344,7 +343,7 @@ def test_get_2c2e_cart_sr(self):
 
     def test_make_j3c_gamma_sr(self):
         dfbuilder = rsdf_builder._RSGDFBuilder(cell_sr, auxcell_sr).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name)
             v2 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(lib.fp(v2), 0.9647178630614499, 8)
@@ -373,7 +372,7 @@ def test_make_j3c_gamma_sr(self):
 
     def test_make_j3c_sr_high_cost(self):
         dfbuilder = rsdf_builder._RSGDFBuilder(cell_sr, auxcell_sr, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2')
             v_s2 = []
             for ki in range(nkpts):
@@ -397,7 +396,7 @@ def test_make_j3c_sr_high_cost(self):
 
     def test_make_j3c_j_only_sr(self):
         dfbuilder = rsdf_builder._RSGDFBuilder(cell_sr, auxcell_sr, kpts).build()
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v_s2 = []
             for ki in range(nkpts):
@@ -445,7 +444,7 @@ def test_vs_fft_sr(self):
         j3c = lib.dot(auxG.conj()*wcoulG, aopair.reshape(ngrids,-1))
         j2c = scipy.linalg.cholesky(j2c[0], lower=True)
         ref = scipy.linalg.solve_triangular(j2c, j3c, lower=True)
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.make_j3c(tmpf.name, aosym='s2', j_only=True)
             v1 = load(tmpf.name, kpts[[0, 0]])
             self.assertAlmostEqual(abs(ref - v1).max(), 0, 7)
@@ -471,7 +470,7 @@ def test_off_center_kpts_rsdf_vs_fft(self):
         dfbuilder = rsdf_builder._RSGDFBuilder(cell, auxcell, kpts)
         dfbuilder.fft_dd_block = False
         dfbuilder.exclude_d_aux = False
-        with tempfile.NamedTemporaryFile() as tmpf:
+        with lib.NamedTemporaryFile() as tmpf:
             dfbuilder.build()
             dfbuilder.make_j3c(tmpf.name)
             nkpts = len(kpts)
diff --git a/pyscf/pbc/dft/test/test_krkspu.py b/pyscf/pbc/dft/test/test_krkspu.py
index 732d1a6b8c..3dda042b01 100644
--- a/pyscf/pbc/dft/test/test_krkspu.py
+++ b/pyscf/pbc/dft/test/test_krkspu.py
@@ -93,7 +93,7 @@ def test_get_veff(self):
         self.assertAlmostEqual(vxc.E_U, 0.07587726255165786, 11)
         self.assertAlmostEqual(lib.fp(vxc), 12.77643098220399, 8)
 
-    def test_KRKSpU_linear_response(self):
+    def test_KRKSpU_linear_response_high_cost(self):
         cell = pgto.Cell()
         cell.unit = 'A'
         cell.atom = 'C 0.,  0.,  0.; C 0.8917,  0.8917,  0.8917'
diff --git a/pyscf/pbc/dft/test/test_rks.py b/pyscf/pbc/dft/test/test_rks.py
index 8acb246522..ec641b2f1f 100644
--- a/pyscf/pbc/dft/test/test_rks.py
+++ b/pyscf/pbc/dft/test/test_rks.py
@@ -18,8 +18,8 @@
 #
 
 import unittest
-import tempfile
 import numpy as np
+from pyscf import lib
 from pyscf.pbc import gto as pbcgto
 from pyscf.pbc import dft as pbcdft
 import pyscf.pbc
@@ -82,8 +82,8 @@ def test_chkfile_k_point(self):
         cell.verbose = 0
         cell.build()
         mf1 = pbcdft.RKS(cell)
-        mf1.chkfile = tempfile.NamedTemporaryFile().name
         mf1.max_cycle = 1
+        mf1.chkfile = lib.NamedTemporaryFile().name
         mf1.kernel()
 
         cell = pbcgto.Cell()
diff --git a/pyscf/pbc/gto/test/test_cell.py b/pyscf/pbc/gto/test/test_cell.py
index cb2ce01c2b..9305ab248b 100644
--- a/pyscf/pbc/gto/test/test_cell.py
+++ b/pyscf/pbc/gto/test/test_cell.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import ctypes
 import numpy
 import numpy as np
@@ -65,7 +64,7 @@ def test_nimgs(self):
 
     def test_Gv(self):
         a = cl1.get_Gv()
-        self.assertAlmostEqual(lib.fp(a), -99.791927068519939, 10)
+        self.assertAlmostEqual(lib.fp(a), -99.791927068519939, 9)
 
     def test_SI(self):
         a = cl1.get_SI()
@@ -616,7 +615,7 @@ def test_fromstring(self):
 
     def test_fromfile(self):
         ref = cl.atom_coords().copy()
-        with tempfile.NamedTemporaryFile() as f:
+        with lib.NamedTemporaryFile() as f:
             cl.tofile(f.name, 'xyz')
             cell = pgto.Cell()
             cell.fromfile(f.name, 'xyz')
diff --git a/pyscf/pbc/gw/gw_ac.py b/pyscf/pbc/gw/gw_ac.py
new file mode 100644
index 0000000000..96590b252c
--- /dev/null
+++ b/pyscf/pbc/gw/gw_ac.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Tianyu Zhu <zhutianyu1991@gmail.com>
+# Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
+# Author: Jiachen Li <lijiachen.duke@gmail.com>
+#
+
+"""
+PBC gamma-point spin-restricted G0W0 method based on the analytic continuation scheme.
+This implementation has N^4 scaling,
+and is faster than GW-CD (N^4~N5) and fully analytic GW (N^6) methods.
+GW-AC is recommended for valence states only, and is inaccurate for core states.
+
+References:
+    T. Zhu and G.K.-L. Chan, J. Chem. Theory. Comput. 17, 727-741 (2021)
+    New J. Phys. 14 053020 (2012)
+"""
+
+from functools import reduce
+import numpy as np
+
+from pyscf.ao2mo._ao2mo import nr_e2
+from pyscf.lib import current_memory, logger
+from pyscf.pbc import df, scf
+from pyscf.pbc.df.fft_ao2mo import _format_kpts
+from pyscf.pbc.df.df_ao2mo import warn_pbc2d_eri
+
+from pyscf.gw.gw_ac import GWAC as GWAC_mol
+
+
+class GWAC(GWAC_mol):
+    def __init__(self, mf, frozen=None, auxbasis=None):
+        if abs(mf.kpt).max() > 1e-9:
+            raise NotImplementedError
+        warn_pbc2d_eri(mf)
+
+        GWAC_mol.__init__(self, mf, frozen=frozen, auxbasis=auxbasis)
+        self.fc = False
+
+        return
+
+    def dump_flags(self):
+        log = logger.Logger(self.stdout, self.verbose)
+        log.info('')
+        log.info('******** %s ********', self.__class__)
+        log.info('method = %s', self.__class__.__name__)
+        nocc = self.nocc
+        nvir = self.nmo - nocc
+        log.info('GW nocc = %d, nvir = %d', nocc, nvir)
+        log.info('frozen orbitals = %s', self.frozen)
+        log.info('off-diagonal self-energy = %s', self.fullsigma)
+        log.info('GW density matrix = %s', self.rdm)
+        log.info('density-fitting for exchange = %s', self.vhf_df)
+        log.info('finite-size correction = %s', self.fc)
+        log.info('outcore for self-energy= %s', self.outcore)
+        if self.outcore is True:
+            log.info('outcore segment size = %d', self.segsize)
+        log.info('broadening parameter = %.3e', self.eta)
+        if self.nw2 is None:
+            log.info('number of grids = %d', self.nw)
+        else:
+            log.info('grid size for W is %d', self.nw)
+            log.info('grid size for self-energy is %d', self.nw2)
+        log.info('analytic continuation method = %s', self.ac)
+        log.info('imaginary frequency cutoff = %.1f', self.ac_iw_cutoff)
+        if self.ac == 'pade':
+            log.info('Pade points = %d', self.ac_pade_npts)
+            log.info('Pade step ratio = %.3f', self.ac_pade_step_ratio)
+        log.info('use perturbative linearized QP eqn = %s', self.qpe_linearized)
+        if self.qpe_linearized is True:
+            log.info('linearized factor range = %s', self.qpe_linearized_range)
+        else:
+            log.info('QPE max iter = %d', self.qpe_max_iter)
+            log.info('QPE tolerance = %.1e', self.qpe_tol)
+        log.info('')
+        return
+
+    def initialize_df(self, auxbasis=None):
+        """Initialize density fitting.
+
+        Parameters
+        ----------
+        auxbasis : str, optional
+            name of auxiliary basis set, by default None
+        """
+        if getattr(self._scf, 'with_df', None):
+            self.with_df = self._scf.with_df
+        else:
+            self.with_df = df.DF(self._scf.mol)
+            if auxbasis is not None:
+                self.with_df.auxbasis = auxbasis
+            else:
+                try:
+                    self.with_df.auxbasis = df.make_auxbasis(self._scf.mol, mp2fit=True)
+                except RuntimeError:
+                    self.with_df.auxbasis = df.make_auxbasis(self._scf.mol, mp2fit=False)
+        self._keys.update(['with_df'])
+        return
+
+    def ao2mo(self, mo_coeff=None):
+        """Transform density-fitting integral from AO to MO.
+
+        Parameters
+        ----------
+        mo_coeff : double 2d array, optional
+            coefficient from AO to MO, by default None
+
+        Returns
+        -------
+        Lpq : double 3d array
+            three-center density-fitting matrix in MO
+        """
+        if mo_coeff is None:
+            mo_coeff = self.mo_coeff
+        nmo = mo_coeff.shape[1]
+        nao = self.mo_coeff.shape[0]
+        naux = self.with_df.get_naoaux()
+        kpts = self._scf.with_df.kpts
+        max_memory = max(2000, self._scf.max_memory - current_memory()[0] - nao**2 * naux * 8 / 1e6)
+
+        mo = np.asarray(mo_coeff, order='F')
+        ijslice = (0, nmo, 0, nmo)
+
+        kptijkl = _format_kpts(kpts)
+        eri_3d = []
+        for LpqR, _, _ in self._scf.with_df.sr_loop(kptijkl[:2], max_memory=0.3 * max_memory, compact=False):
+            Lpq = None
+            Lpq = nr_e2(LpqR.reshape(-1, nao, nao), mo, ijslice, aosym='s1', mosym='s1', out=Lpq)
+            eri_3d.append(Lpq)
+        eri_3d = np.vstack(eri_3d).reshape(-1, nmo, nmo)
+
+        return eri_3d
+
+    def loop_ao2mo(self, mo_coeff=None, ijslice=None):
+        """Transform density-fitting integral from AO to MO by block.
+
+        Parameters
+        ----------
+        mo_coeff : double 2d array, optional
+            coefficient from AO to MO, by default None
+        ijslice : tuple, optional
+            tuples for (1st idx start, 1st idx end, 2nd idx start, 2nd idx end), by default None
+
+        Returns
+        -------
+        eri_3d : double 3d array
+            three-center density-fitting matrix in MO in a block
+        """
+        if mo_coeff is None:
+            mo_coeff = self.mo_coeff
+        nmo = mo_coeff.shape[1]
+        nao = self.mo_coeff.shape[0]
+        naux = self.with_df.get_naoaux()
+        kpts = self._scf.with_df.kpts
+        max_memory = max(2000, self._scf.max_memory - current_memory()[0] - nao**2 * naux * 8 / 1e6)
+
+        mo = np.asarray(mo_coeff, order='F')
+        if ijslice is None:
+            ijslice = (0, nmo, 0, nmo)
+        nislice = ijslice[1] - ijslice[0]
+        njslice = ijslice[3] - ijslice[2]
+
+        kptijkl = _format_kpts(kpts)
+        eri_3d = []
+        for LpqR, _, _ in self._scf.with_df.sr_loop(kptijkl[:2], max_memory=0.2 * max_memory, compact=False):
+            Lpq = None
+            Lpq = nr_e2(LpqR.reshape(-1, nao, nao), mo, ijslice, aosym='s1', mosym='s1', out=Lpq)
+            eri_3d.append(Lpq)
+        eri_3d = np.vstack(eri_3d).reshape(-1, nislice, njslice)
+
+        return eri_3d
+
+    def get_sigma_exchange(self, mo_coeff):
+        """Get exchange self-energy (EXX).
+        The exchange self-energy is calculated via PySCF functions with exxdiv=None,
+        then the finite-size correction for GW exchange self-energy is added if self.fc is True.
+        The finite-size correction is defined similar to k-point GW method,
+        as in equation 46 in doi.org/10.1021/acs.jctc.0c00704
+
+        Parameters
+        ----------
+        mo_coeff : double 2d array
+            orbital coefficient
+
+        Returns
+        -------
+        vk : double 2d array
+            exchange self-energy
+        """
+        dm = self._scf.make_rdm1()
+        if isinstance(self._scf.with_df, df.GDF):
+            rhf = scf.RHF(self.mol).density_fit()
+        elif isinstance(self._scf.with_df, df.RSDF):
+            rhf = scf.RHF(self.mol).rs_density_fit()
+        if hasattr(self._scf, 'sigma'):
+            rhf = scf.addons.smearing_(rhf, sigma=self._scf.sigma, method=self._scf.smearing_method)
+        rhf.exxdiv = None
+        rhf.with_df = self.with_df
+        vk = rhf.get_veff(self.mol, dm) - rhf.get_j(self.mol, dm)
+        vk = reduce(np.matmul, (mo_coeff.T, vk, mo_coeff))
+
+        if self.fc:
+            vk_corr = -2.0 / np.pi * (6.0 * np.pi**2 / self.mol.vol) ** (1.0 / 3.0)
+            for i in range(self.nocc):
+                vk[i, i] = vk[i, i] + vk_corr
+        return vk
diff --git a/pyscf/pbc/gw/krgw_ac.py b/pyscf/pbc/gw/krgw_ac.py
index c9a67b3216..5c828e611f 100644
--- a/pyscf/pbc/gw/krgw_ac.py
+++ b/pyscf/pbc/gw/krgw_ac.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2021 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,250 +14,495 @@
 # limitations under the License.
 #
 # Author: Tianyu Zhu <zhutianyu1991@gmail.com>
+# Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
+# Author: Jiachen Li <lijiachen.duke@gmail.com>
 #
 
 '''
-PBC spin-restricted G0W0-AC QP eigenvalues with k-point sampling
-This implementation has N^4 scaling, and is faster than GW-CD (N^4)
-and analytic GW (N^6) methods.
+Periodic spin-restricted G0W0 method based on the analytic continuation scheme.
+This implementation has N^4 scaling,
+and is faster than GW-CD (N^4~N^5) and fully analytic GW (N^6) methods.
 GW-AC is recommended for valence states only, and is inaccurate for core states.
 
-Method:
-    See T. Zhu and G.K.-L. Chan, arxiv:2007.03148 (2020) for details
-    Compute Sigma on imaginary frequency with density fitting,
-    then analytically continued to real frequency.
-    Gaussian density fitting must be used (FFTDF and MDF are not supported).
+References:
+    T. Zhu and G.K.-L. Chan, J. Chem. Theory. Comput. 17, 727-741 (2021)
+    New J. Phys. 14 053020 (2012)
 '''
 
 from functools import reduce
-import numpy
-import numpy as np
 import h5py
-from scipy.optimize import newton, least_squares
+import numpy as np
+import scipy
+import time
+
+import scipy.linalg
 
 from pyscf import lib
-from pyscf.lib import logger
 from pyscf.ao2mo import _ao2mo
 from pyscf.ao2mo.incore import _conc_mos
-from pyscf.pbc import df, dft, scf
-from pyscf.pbc.mp.kmp2 import get_nocc, get_nmo, get_frozen_mask
-from pyscf import __config__
+from pyscf.lib import einsum, logger, temporary_env
+from pyscf.pbc import df, dft
+from pyscf.pbc.mp.kmp2 import get_frozen_mask
 
-einsum = lib.einsum
+from pyscf.gw.utils.ac_grid import _get_scaled_legendre_roots, PadeAC, TwoPoleAC
+from pyscf.gw.utils.gw_np_helper import mkslice, array_scale
 
-def kernel(gw, mo_energy, mo_coeff, orbs=None,
-           kptlist=None, nw=None, verbose=logger.NOTE):
-    '''GW-corrected quasiparticle orbital energies
 
-    Returns:
-        A list :  converged, mo_energy, mo_coeff
-    '''
+def kernel(gw):
     mf = gw._scf
-    assert gw.frozen is None
+    nocc = gw.nocc
+    nmo = gw.nmo
+    nkpts = gw.nkpts
 
-    if orbs is None:
-        orbs = range(gw.nmo)
+    # set frozen orbitals
+    gw.set_frozen_orbs()
+    orbs = gw.orbs
+    orbs_frz = gw.orbs_frz
+    kptlist = gw.kptlist
     if kptlist is None:
-        kptlist = range(gw.nkpts)
-    nkpts = gw.nkpts
-    nklist = len(kptlist)
+        gw.kptlist = kptlist = range(gw.nkpts)
+    mo_energy_frz = _mo_energy_frozen(gw, gw.mo_energy)
+    mo_coeff_frz = _mo_frozen(gw, gw.mo_coeff)
 
     # v_xc
-    dm = np.array(mf.make_rdm1())
-    v_mf = np.array(mf.get_veff()) - np.array(mf.get_j(dm_kpts=dm))
+    with temporary_env(mf, verbose=0), temporary_env(mf.mol, verbose=0), temporary_env(mf.with_df, verbose=0):
+        dm = mf.make_rdm1()
+        v_mf_ao = mf.get_veff() - mf.get_j(dm_kpts=dm)
+    v_mf = np.zeros(shape=[nkpts, nmo, nmo], dtype=np.complex128)
     for k in range(nkpts):
-        v_mf[k] = reduce(numpy.dot, (mo_coeff[k].T.conj(), v_mf[k], mo_coeff[k]))
-
-    nocc = gw.nocc
-    nmo = gw.nmo
+        v_mf[k] = reduce(np.matmul, (mo_coeff_frz[k].T.conj(), v_mf_ao[k], mo_coeff_frz[k]))
+    gw.vxc = v_mf
 
     # v_hf from DFT/HF density
+    vk = gw.get_sigma_exchange()
+
+    # finite size correction for exchange self-energy
     if gw.fc:
-        exxdiv = 'ewald'
+        vk_corr = -2.0 / np.pi * (6.0 * np.pi**2 / gw.mol.vol / nkpts) ** (1.0 / 3.0)
+        for k in range(nkpts):
+            for i in range(nocc):
+                vk[k][i, i] = vk[k][i, i] + vk_corr
+    gw.vk = vk
+
+    # set up Fermi level
+    gw.ef = ef = get_ef(kmf=mf, mo_energy=mf.mo_energy)
+
+    # grids for integration on imaginary axis
+    gw.freqs, gw.wts = freqs, wts = _get_scaled_legendre_roots(gw.nw)
+
+    # calculate self-energy on imaginary axis
+    if gw.outcore:
+        sigmaI, omega = get_sigma_outcore(
+            gw, freqs, wts, ef=ef, mo_energy=mo_energy_frz, orbs=orbs_frz, kptlist=kptlist, iw_cutoff=gw.ac_iw_cutoff,
+            fullsigma=gw.fullsigma,
+        )
     else:
-        exxdiv = None
-    rhf = scf.KRHF(gw.mol, gw.kpts, exxdiv=exxdiv)
-    rhf.with_df = gw.with_df
-    if getattr(gw.with_df, '_cderi', None) is None:
-        raise RuntimeError('Found incompatible integral scheme %s.'
-                           'KGWAC can be only used with GDF integrals' %
-                           gw.with_df.__class__)
-    if rhf.with_df._j_only:
-        logger.debug(gw, 'Rebuild CDERI for exchange integrals')
-        rhf.with_df.build(j_only=False)
-
-    vk = rhf.get_veff(gw.mol,dm_kpts=dm) - rhf.get_j(gw.mol,dm_kpts=dm)
-    for k in range(nkpts):
-        vk[k] = reduce(numpy.dot, (mo_coeff[k].T.conj(), vk[k], mo_coeff[k]))
-
-    # Grids for integration on imaginary axis
-    freqs,wts = _get_scaled_legendre_roots(nw)
+        sigmaI, omega = get_sigma(
+            gw, freqs, wts, ef=ef, mo_energy=mo_energy_frz, orbs=orbs_frz, kptlist=kptlist, iw_cutoff=gw.ac_iw_cutoff,
+            fullsigma=gw.fullsigma,
+        )
 
-    # Compute self-energy on imaginary axis i*[0,iw_cutoff]
-    sigmaI, omega = get_sigma_diag(gw, orbs, kptlist, freqs, wts, iw_cutoff=5.)
-
-    # Analytic continuation
-    coeff = []
+    # analytic continuation
     if gw.ac == 'twopole':
-        for k in range(nklist):
-            coeff.append(AC_twopole_diag(sigmaI[k], omega, orbs, nocc))
+        acobj = TwoPoleAC(list(range(nmo)), nocc)
     elif gw.ac == 'pade':
-        for k in range(nklist):
-            coeff_tmp, omega_fit = AC_pade_thiele_diag(sigmaI[k], omega)
-            coeff.append(coeff_tmp)
-    coeff = np.array(coeff)
-
-    conv = True
-    # This code does not support metals
-    homo = -99.
-    lumo = 99.
-    for k in range(nkpts):
-        if homo < mf.mo_energy[k][nocc-1]:
-            homo = mf.mo_energy[k][nocc-1]
-        if lumo > mf.mo_energy[k][nocc]:
-            lumo = mf.mo_energy[k][nocc]
-    ef = (homo+lumo)/2.
-
-    mo_energy = np.zeros_like(np.array(mf.mo_energy))
-    for k in range(nklist):
-        kn = kptlist[k]
-        for p in orbs:
-            if gw.linearized:
+        acobj = PadeAC(npts=gw.ac_pade_npts, step_ratio=gw.ac_pade_step_ratio)
+    else:
+        raise ValueError('Unknown GW-AC type %s' % (str(gw.ac)))
+
+    acobj.ac_fit(sigmaI, omega, axis=-1)
+
+    if gw.fullsigma:
+        diag_acobj = acobj.diagonal(axis1=1, axis2=2)
+    else:
+        diag_acobj = acobj
+
+    mo_energy = np.zeros_like(mf.mo_energy)
+    for ik, k in enumerate(kptlist):
+        for ip, p in enumerate(orbs_frz):
+            if gw.qpe_linearized:
                 # linearized G0W0
                 de = 1e-6
-                ep = mf.mo_energy[kn][p]
-                #TODO: analytic sigma derivative
-                if gw.ac == 'twopole':
-                    sigmaR = two_pole(ep-ef, coeff[k,:,p-orbs[0]]).real
-                    dsigma = two_pole(ep-ef+de, coeff[k,:,p-orbs[0]]).real - sigmaR.real
-                elif gw.ac == 'pade':
-                    sigmaR = pade_thiele(ep-ef, omega_fit[p-orbs[0]], coeff[k,:,p-orbs[0]]).real
-                    dsigma = pade_thiele(ep-ef+de, omega_fit[p-orbs[0]], coeff[k,:,p-orbs[0]]).real - sigmaR.real
-                zn = 1.0/(1.0-dsigma/de)
-                e = ep + zn*(sigmaR.real + vk[kn,p,p].real - v_mf[kn,p,p].real)
-                mo_energy[kn,p] = e
+                ep = mf.mo_energy[k][orbs[ip]]
+                sigmaR = diag_acobj[ik, ip].ac_eval(ep).real
+                dsigma = diag_acobj[ik, ip].ac_eval(ep + de).real - sigmaR.real
+                zn = 1.0 / (1.0 - dsigma / de)
+                if gw.qpe_linearized_range is not None:
+                    zn = 1.0 if zn < gw.qpe_linearized_range[0] or zn > gw.qpe_linearized_range[1] else zn
+                mo_energy[k, orbs[ip]] = ep + zn * (sigmaR + vk[k, p, p] - v_mf[k, p, p]).real
             else:
                 # self-consistently solve QP equation
                 def quasiparticle(omega):
-                    if gw.ac == 'twopole':
-                        sigmaR = two_pole(omega-ef, coeff[k,:,p-orbs[0]]).real
-                    elif gw.ac == 'pade':
-                        sigmaR = pade_thiele(omega-ef, omega_fit[p-orbs[0]], coeff[k,:,p-orbs[0]]).real
-                    return omega - mf.mo_energy[kn][p] - (sigmaR.real + vk[kn,p,p].real - v_mf[kn,p,p].real)
+                    sigmaR = diag_acobj[ik, ip].ac_eval(omega)
+                    return omega - mf.mo_energy[k][orbs[ip]] - (sigmaR + vk[k, p, p] - v_mf[k, p, p]).real
+
                 try:
-                    e = newton(quasiparticle, mf.mo_energy[kn][p], tol=1e-6, maxiter=100)
-                    mo_energy[kn,p] = e
+                    mo_energy[k, orbs[ip]] = scipy.optimize.newton(
+                        quasiparticle, mf.mo_energy[k][orbs[ip]], tol=gw.qpe_tol, maxiter=gw.qpe_max_iter
+                    )
                 except RuntimeError:
-                    conv = False
-    mo_coeff = mf.mo_coeff
+                    logger.warn(gw, 'QPE for k=%d orbital=%d not converged!', k, orbs[ip])
 
-    if gw.verbose >= logger.DEBUG:
-        numpy.set_printoptions(threshold=nmo)
+    # save GW results
+    gw.mo_energy = mo_energy
+    gw.acobj = acobj
+
+    with np.printoptions(threshold=len(mf.mo_energy[0])):
         for k in range(nkpts):
-            logger.debug(gw, '  GW mo_energy @ k%d =\n%s', k,mo_energy[k])
-        numpy.set_printoptions(threshold=1000)
+            logger.debug(gw, '  GW mo_energy @ k%d =\n%s', k, mo_energy[k])
+    logger.warn(gw, 'GW QP energies may not be sorted from min to max')
+
+    if gw.writefile > 0:
+        with h5py.File('vxc.h5', 'w') as feri:
+            feri['vk'] = np.asarray(vk)
+            feri['v_mf'] = np.asarray(v_mf)
+
+        with h5py.File('sigma_imag.h5', 'w') as feri:
+            feri['sigmaI'] = np.asarray(sigmaI)
+            feri['omega'] = np.asarray(omega)
+            if gw.sigmaI is not None:
+                feri['sigmaI_full'] = np.asarray(gw.sigmaI)
+
+        acobj.save('ac_coeff.h5')
+
+    return
+
+
+def get_rho_response(omega, mo_energy, Lia, kidx):
+    """Get Pi=PV.
+    P is density-density response function.
+    V is two-electron integral.
+    See equation 24 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    omega : double
+        real position of imaginary frequency
+    mo_energy : double 2d array
+        orbital energy
+    Lia : complex 4d ndarray
+        occupied-virtual block of three-center density-fitting matrix in MO
+    kidx : list
+        momentum-conserved k-point list kj=kidx[ki]
+
+    Returns
+    -------
+    Pi : complex ndarray
+        Pi in auxiliary basis at freq iw
+    """
+    nkpts, naux, nocc, nvir = Lia.shape
 
-    return conv, mo_energy, mo_coeff
+    # Compute Pi for kL
+    Pi = np.zeros(shape=[naux, naux], dtype=np.complex128)
+    for i in range(nkpts):
+        # Find ka that conserves with ki and kL (-ki+ka+kL=G)
+        a = kidx[i]
+        eia = mo_energy[i, :nocc, None] - mo_energy[a, None, nocc:]
+        Lia_i = Lia[i]
+        eia = eia / (omega**2 + eia**2)
+        Pia = Lia_i * eia
+        # Response from both spin-up and spin-down density
+        # Pi += (4./nkpts) * einsum('Pia,Qia->PQ', Pia, Lov.conj())
+        scipy.linalg.blas.zgemm(
+            alpha=4.0 / nkpts,
+            a=Lia_i.reshape(naux, nocc * nvir).T,
+            b=Pia.reshape(naux, nocc * nvir).T,
+            c=Pi.T,
+            trans_a=2,
+            trans_b=0,
+            beta=1.0,
+            overwrite_c=True,
+        )
+        Pia = Lia_i = None
+
+    return Pi
 
-def get_rho_response(gw, omega, mo_energy, Lpq, kL, kidx):
-    '''
-    Compute density response function in auxiliary basis at freq iw
-    '''
-    nkpts, naux, nmo, nmo = Lpq.shape
-    nocc = gw.nocc
-    kpts = gw.kpts
-    kscaled = gw.mol.get_scaled_kpts(kpts)
-    kscaled -= kscaled[0]
+
+def get_rho_response_metal(omega, mo_energy, mo_occ, Lpq, kidx):
+    """Get Pi=PV for metallic systems.
+    P is density-density response function.
+    V is two-electron integral.
+    See equation 24 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    omega : double
+        real position of imaginary frequency
+    mo_energy : double ndarray
+        orbital energy
+    mo_occ : double ndarray
+        occupation number
+    Lpq : complex ndarray
+        three-center density-fitting matrix in MO
+    kidx : list
+        momentum-conserved k-point list kj=kidx[ki]
+
+    Returns
+    -------
+    Pi : complex ndarray
+        Pi in auxiliary basis at freq iw
+    """
+    nkpts, naux, nmo, _ = Lpq.shape
+    mo_occ = [x / 2.0 for x in mo_occ]
 
     # Compute Pi for kL
-    Pi = np.zeros((naux,naux),dtype=np.complex128)
-    for i, kpti in enumerate(kpts):
+    Pi = np.zeros(shape=[naux, naux], dtype=np.complex128)
+    for i in range(nkpts):
         # Find ka that conserves with ki and kL (-ki+ka+kL=G)
         a = kidx[i]
-        eia = mo_energy[i,:nocc,None] - mo_energy[a,None,nocc:]
-        eia = eia/(omega**2+eia*eia)
-        Pia = einsum('Pia,ia->Pia',Lpq[i][:,:nocc,nocc:],eia)
+        eia = mo_energy[i, :, None] - mo_energy[a, None, :]
+        fia = mo_occ[i][:, None] - mo_occ[a][None, :]
+        Lia = np.ascontiguousarray(Lpq[i])
+        eia = eia * fia / (omega**2 + eia**2)
+        Pia = Lia * eia
         # Response from both spin-up and spin-down density
-        Pi += 4./nkpts * einsum('Pia,Qia->PQ',Pia,Lpq[i][:,:nocc,nocc:].conj())
+        # both ia and ai are included, this gives a factor of 2.0
+        # Pi += (2./nkpts) * einsum('Pia,Qia->PQ', Pia, Lpq_i.conj())
+        scipy.linalg.blas.zgemm(
+            alpha=2.0 / nkpts,
+            a=Lia.reshape(naux, nmo * nmo).T,
+            b=Pia.reshape(naux, nmo * nmo).T,
+            c=Pi.T,
+            trans_a=2,
+            trans_b=0,
+            beta=1.0,
+            overwrite_c=True,
+        )
+        Pia = Lia = None
+
+    return Pi
+
+
+def get_rho_response_head(omega, mo_energy, qij):
+    """Compute head (G=0, G'=0) density response function in auxiliary basis at freq iw.
+    equation 48 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    omega : double
+        frequency point
+    mo_energy : double ndarray
+        orbital energy
+    qij : complex ndarray
+        pair density matrix defined as equation 51 in 10.1021/acs.jctc.0c00704
+
+    Returns
+    -------
+    Pi_00 : complex
+        head response function
+    """
+    nkpts, nocc = qij.shape[:2]
+
+    Pi_00 = 0j
+    for k in range(nkpts):
+        eia = mo_energy[k, :nocc, None] - mo_energy[k, None, nocc:]
+        eia = eia / (omega**2 + eia**2)
+        Pi_00 += 4.0 / nkpts * einsum('ia,ia->', eia, qij[k].conj() * qij[k])
+    return Pi_00
+
+
+def get_rho_response_wing(omega, mo_energy, Lia, qij):
+    """Compute wing (G=P, G'=0) density response function in auxiliary basis at freq iw.
+    equation 48 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    omega : double
+        frequency point
+    mo_energy : double 2d array
+        orbital energy
+    Lia : complex 4d array
+        occupied-virtual block of three-center density fitting matrix in MO
+    qij : complex ndarray
+        pair density matrix defined as equation 51 in 10.1021/acs.jctc.0c00704
+
+    Returns
+    -------
+    Pi : complex ndarray
+        wing response function
+    """
+    nkpts, naux, nocc, nvir = Lia.shape
+
+    Pi = np.zeros(shape=[naux], dtype=np.complex128)
+    for k in range(nkpts):
+        eia = mo_energy[k, :nocc, None] - mo_energy[k, None, nocc:]
+        eia = eia / (omega**2 + eia**2)
+        eia_q = eia * qij[k].conj()
+        Pi += 4.0 / nkpts * np.matmul(Lia[k].reshape(naux, nocc * nvir), eia_q.reshape(nocc * nvir))
     return Pi
 
-def get_sigma_diag(gw, orbs, kptlist, freqs, wts, iw_cutoff=None, max_memory=8000):
-    '''
-    Compute GW correlation self-energy (diagonal elements)
-    in MO basis on imaginary axis
-    '''
-    mo_energy = np.array(gw._scf.mo_energy)
-    mo_coeff = np.array(gw._scf.mo_coeff)
+
+def get_qij(gw, q, mo_energy, mo_coeff, uniform_grids=False):
+    """Compute pair density matrix in the long-wavelength limit through kp perturbation theory
+    qij = 1/Omega * |< psi_{ik} | e^{iqr} | psi_{ak-q} >|^2
+    equation 51 in 10.1021/acs.jctc.0c00704
+    Ref: Phys. Rev. B 83, 245122 (2011)
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        gw object, provides attributes: nocc, nmo, kpts, mol
+    q : double
+        q grid
+    mo_energy : double ndarray
+        orbital energy
+    mo_coeff : complex ndarray
+        coefficient from AO to MO
+    uniform_grids : bool, optional
+        use uniform grids, by default False
+
+    Returns
+    -------
+    qij : complex ndarray
+        pair density matrix in the long-wavelength limit
+    """
+    nocc = gw.nocc
+    nmo = gw.nmo
+    nvir = nmo - nocc
+    kpts = gw.kpts
+    nkpts = len(kpts)
+    cell = gw.mol
+
+    if uniform_grids:
+        with temporary_env(cell, verbose=0):
+            mydf = df.FFTDF(cell, kpts=kpts)
+            coords = cell.gen_uniform_grids(mydf.mesh)
+    else:
+        with temporary_env(cell, verbose=0):
+            coords, weights = dft.gen_grid.get_becke_grids(cell, level=4)
+    ngrid = len(coords)
+
+    qij = np.zeros(shape=[nkpts, nocc, nvir], dtype=np.complex128)
+    for i, kpti in enumerate(kpts):
+        ao_p = dft.numint.eval_ao(cell, coords, kpt=kpti, deriv=1)
+        ao = ao_p[0]
+        ao_grad = ao_p[1:4]
+        if uniform_grids:
+            ao_ao_grad = einsum('mg,xgn->xmn', ao.T.conj(), ao_grad) * cell.vol / ngrid
+        else:
+            ao_ao_grad = einsum('g,mg,xgn->xmn', weights, ao.T.conj(), ao_grad)
+        q_ao_ao_grad = -1j * einsum('x,xmn->mn', q, ao_ao_grad)
+        q_mo_mo_grad = reduce(np.matmul, (mo_coeff[i][:, :nocc].T.conj(), q_ao_ao_grad, mo_coeff[i][:, nocc:]))
+        enm = 1.0 / (mo_energy[i][nocc:, None] - mo_energy[i][None, :nocc])
+        dens = enm.T * q_mo_mo_grad
+        qij[i] = dens / np.sqrt(cell.vol)
+
+    return qij
+
+
+def get_sigma(
+    gw, freqs, wts, ef, mo_energy, orbs=None, kptlist=None, mo_coeff=None, mo_occ=None, iw_cutoff=None, fullsigma=False
+):
+    """Get GW self-energy.
+    See equation 27 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        GW objects,
+        provides attributes: _scf, mol, frozen, nmo, nocc, kpts, nkpts, mo_coeff, mo_occ, fc, fc_grid, with_df
+    freqs : double array
+        position of imaginary frequency
+    wts : double array
+        weight of frequency points
+    ef : double
+        Fermi level
+    mo_energy : double ndarray
+        non-frozen orbital energy
+    orbs : list, optional
+        orbital index in non-frozen nmo to calculate self-energy, by default None
+    kptlist : list, optional
+        k-point index to calculate self-energy, by default None
+    mo_coeff : complex ndarray, optional
+        coefficient from AO to non-frozen MO, by default None
+    mo_occ : double ndarray, optional
+        non-frozen occupation number, by default None
+    iw_cutoff : complex, optional
+        imaginary grid cutoff for fitting, by default None
+    fullsigma : bool, optional
+        calculate off-diagonal elements, by default False
+
+    Returns
+    -------
+    sigma: complex ndarray
+        self-energy on the imaginary axis
+    omega: complex ndarray
+        imaginary frequency grids of self-energy
+    """
     nocc = gw.nocc
     nmo = gw.nmo
     nkpts = gw.nkpts
     kpts = gw.kpts
+
+    if orbs is None:
+        orbs = list(range(nmo))
+    if kptlist is None:
+        kptlist = list(range(nkpts))
+    norbs = len(orbs)
     nklist = len(kptlist)
     nw = len(freqs)
-    norbs = len(orbs)
-    mydf = gw.with_df
+
+    if mo_coeff is None:
+        mo_coeff = _mo_frozen(gw, gw.mo_coeff)
+    if mo_occ is None:
+        mo_occ = _mo_occ_frozen(gw, gw.mo_occ)
+    nao = mo_coeff[0].shape[0]
 
     # possible kpts shift center
     kscaled = gw.mol.get_scaled_kpts(kpts)
     kscaled -= kscaled[0]
 
-    # This code does not support metals
-    homo = -99.
-    lumo = 99.
-    for k in range(nkpts):
-        if homo < mo_energy[k][nocc-1]:
-            homo = mo_energy[k][nocc-1]
-        if lumo > mo_energy[k][nocc]:
-            lumo = mo_energy[k][nocc]
-    if (lumo-homo)<1e-3:
-        logger.warn(gw, 'This GW-AC code is not supporting metals!')
-    ef = (homo+lumo)/2.
-
     # Integration on numerical grids
-    if iw_cutoff is not None:
+    if iw_cutoff is not None and gw.rdm is False:
         nw_sigma = sum(iw < iw_cutoff for iw in freqs) + 1
     else:
         nw_sigma = nw + 1
 
-    # Compute occ for -iw and vir for iw separately
-    # to avoid branch cuts in analytic continuation
-    omega_occ = np.zeros((nw_sigma), dtype=np.complex128)
-    omega_vir = np.zeros((nw_sigma), dtype=np.complex128)
-    omega_occ[1:] = -1j*freqs[:(nw_sigma-1)]
-    omega_vir[1:] = 1j*freqs[:(nw_sigma-1)]
-    orbs_occ = [i for i in orbs if i < nocc]
-    norbs_occ = len(orbs_occ)
-
-    emo_occ = np.zeros((nkpts,nmo,nw_sigma),dtype=np.complex128)
-    emo_vir = np.zeros((nkpts,nmo,nw_sigma),dtype=np.complex128)
-    for k in range(nkpts):
-        emo_occ[k] = omega_occ[None,:] + ef - mo_energy[k][:,None]
-        emo_vir[k] = omega_vir[None,:] + ef - mo_energy[k][:,None]
-
-    sigma = np.zeros((nklist,norbs,nw_sigma),dtype=np.complex128)
-    omega = np.zeros((norbs,nw_sigma),dtype=np.complex128)
-    for p in range(norbs):
-        orbp = orbs[p]
-        if orbp < nocc:
-            omega[p] = omega_occ.copy()
-        else:
-            omega[p] = omega_vir.copy()
+    omega = np.zeros(shape=[nw_sigma], dtype=np.complex128)
+    omega[1:] = 1j * freqs[: (nw_sigma - 1)] + ef
+    emo = omega[None, None, :] - mo_energy[:, :, None]
 
+    if fullsigma is False:
+        sigma = np.zeros(shape=[nklist, norbs, nw_sigma], dtype=np.complex128)
+    else:
+        sigma = np.zeros(shape=[nklist, norbs, norbs, nw_sigma], dtype=np.complex128)
     if gw.fc:
         # Set up q mesh for q->0 finite size correction
-        q_pts = np.array([1e-3,0,0]).reshape(1,3)
+        if not gw.fc_grid:
+            q_pts = np.array([1e-3, 0, 0], dtype=np.double).reshape(1, 3)
+        else:
+            Nq = 3
+            q_pts = np.zeros(shape=[Nq**3 - 1, 3], dtype=np.double)
+            for i in range(Nq):
+                for j in range(Nq):
+                    for k in range(Nq):
+                        if i == 0 and j == 0 and k == 0:
+                            continue
+                        else:
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 0] = k * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 1] = j * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 2] = i * 5e-4
+        nq_pts = len(q_pts)
         q_abs = gw.mol.get_abs_kpts(q_pts)
 
         # Get qij = 1/sqrt(Omega) * < psi_{ik} | e^{iqr} | psi_{ak-q} > at q: (nkpts, nocc, nvir)
-        qij = get_qij(gw, q_abs[0], mo_coeff)
+        qij = np.zeros(shape=[nq_pts, nkpts, nocc, nmo - nocc], dtype=np.complex128)
+
+        if not gw.fc_grid:
+            for k in range(nq_pts):
+                qij[k] = get_qij(gw, q_abs[k], mo_energy, mo_coeff)
+        else:
+            for k in range(nq_pts):
+                qij[k] = get_qij(gw, q_abs[k], mo_energy, mo_coeff)
 
+    cderiarr = gw.with_df.cderi_array()
     for kL in range(nkpts):
         # Lij: (ki, L, i, j) for looping every kL
         Lij = []
         # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
         # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
-        kidx = np.zeros((nkpts),dtype=np.int64)
-        kidx_r = np.zeros((nkpts),dtype=np.int64)
+        kidx = np.zeros(shape=[nkpts], dtype=np.int64)
+        kidx_r = np.zeros(shape=[nkpts], dtype=np.int64)
         for i, kpti in enumerate(kpts):
             for j, kptj in enumerate(kpts):
                 # Find (ki,kj) that satisfies momentum conservation with kL
@@ -266,313 +511,852 @@ def get_sigma_diag(gw, orbs, kptlist, freqs, wts, iw_cutoff=None, max_memory=800
                 if is_kconserv:
                     kidx[i] = j
                     kidx_r[j] = i
-                    logger.debug(gw, "Read Lpq (kL: %s / %s, ki: %s, kj: %s)"%(kL+1, nkpts, i, j))
+                    logger.debug(gw, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s)' % (kL + 1, nkpts, i, j))
                     Lij_out = None
                     # Read (L|pq) and ao2mo transform to (L|ij)
-                    Lpq = []
-                    for LpqR, LpqI, sign \
-                            in mydf.sr_loop([kpti, kptj], max_memory=0.1*gw._scf.max_memory, compact=False):
-                        Lpq.append(LpqR+LpqI*1.0j)
                     # support unequal naux on different k points
-                    Lpq = np.vstack(Lpq).reshape(-1,nmo**2)
-                    tao = []
-                    ao_loc = None
+                    Lpq = cderiarr.load(kpti, kptj)
+                    if Lpq.shape[-1] == (nao*(nao+1))//2:
+                        Lpq = lib.unpack_tril(Lpq).reshape(-1,nao**2)
+                    else:
+                        Lpq = Lpq.reshape(-1,nao**2)
+                    Lpq = Lpq.astype(np.complex128)
+
                     moij, ijslice = _conc_mos(mo_coeff[i], mo_coeff[j])[2:]
-                    Lij_out = _ao2mo.r_e2(Lpq, moij, ijslice, tao, ao_loc, out=Lij_out)
-                    Lij.append(Lij_out.reshape(-1,nmo,nmo))
-        Lij = np.asarray(Lij)
+                    Lij_out = _ao2mo.r_e2(Lpq, moij, ijslice, tao=[], ao_loc=None, out=Lij_out)
+                    Lij.append(Lij_out.reshape(-1, nmo, nmo))
+        Lij = np.ascontiguousarray(Lij)
         naux = Lij.shape[1]
 
-        if kL == 0:
-            for w in range(nw):
-                # body dielectric matrix eps_body
-                Pi = get_rho_response(gw, freqs[w], mo_energy, Lij, kL, kidx)
-                eps_body_inv = np.linalg.inv(np.eye(naux)-Pi)
-
-                if gw.fc:
-                    # head dielectric matrix eps_00
-                    Pi_00 = get_rho_response_head(gw, freqs[w], mo_energy, qij)
-                    eps_00 = 1. - 4. * np.pi/np.linalg.norm(q_abs[0])**2 * Pi_00
+        if hasattr(gw._scf, 'sigma') is False:
+            Lia = np.ascontiguousarray(Lij[:, :, :nocc, nocc:])
 
-                    # wings dielectric matrix eps_P0
-                    Pi_P0 = get_rho_response_wing(gw, freqs[w], mo_energy, Lij, qij)
-                    eps_P0 = -np.sqrt(4.*np.pi) / np.linalg.norm(q_abs[0]) * Pi_P0
+        # allocate intermediates
+        naux_ones = np.ones(shape=[1, naux], dtype=np.complex128)
+        mnQ = np.zeros(shape=[nmo * norbs, naux], dtype=np.complex128)
+        if fullsigma is False:
+            Qmn = np.zeros(shape=[naux, nmo * norbs], dtype=np.complex128)
+            Wmn = np.zeros(shape=[nmo, norbs], dtype=np.complex128)
+        else:
+            Wmn = np.zeros(shape=[nmo, norbs, norbs], dtype=np.complex128)
+            Lij_kmQn = np.ascontiguousarray(Lij.transpose(0, 2, 1, 3))
 
-                    # inverse dielectric matrix
-                    eps_inv_00 = 1./(eps_00 - np.dot(np.dot(eps_P0.conj(),eps_body_inv),eps_P0))
-                    eps_inv_P0 = -eps_inv_00 * np.dot(eps_body_inv, eps_P0)
+        for w in range(nw):
+            if hasattr(gw._scf, 'sigma'):
+                Pi = get_rho_response_metal(freqs[w], mo_energy, mo_occ, Lij, kidx)
+            else:
+                Pi = get_rho_response(freqs[w], mo_energy, Lia, kidx)
+            Pi_inv = np.linalg.inv(np.eye(naux) - Pi)
 
-                    # head correction
-                    Del_00 = 2./np.pi * (6.*np.pi**2/gw.mol.vol/nkpts)**(1./3.) * (eps_inv_00 - 1.)
+            if gw.fc and kL == 0:
+                eps_inv_00 = 0j
+                eps_inv_P0 = np.zeros(shape=[naux], dtype=np.complex128)
+                for iq in range(nq_pts):
+                    # head dielectric matrix eps_00, equation 47 in 10.1021/acs.jctc.0c00704
+                    Pi_00 = get_rho_response_head(freqs[w], mo_energy, qij[iq])
+                    eps_00 = 1.0 - 4.0 * np.pi / np.linalg.norm(q_abs[iq]) ** 2.0 * Pi_00
 
-                eps_inv_PQ = eps_body_inv
-                g0_occ = wts[w] * emo_occ / (emo_occ**2+freqs[w]**2)
-                g0_vir = wts[w] * emo_vir / (emo_vir**2+freqs[w]**2)
+                    # wings dielectric matrix eps_P0, equation 48 in 10.1021/acs.jctc.0c00704
+                    Pi_P0 = get_rho_response_wing(freqs[w], mo_energy, Lia, qij[iq])
+                    eps_P0 = -np.sqrt(4.0 * np.pi) / np.linalg.norm(q_abs[iq]) * Pi_P0
 
-                for k in range(nklist):
-                    kn = kptlist[k]
-                    # Find km that conserves with kn and kL (-km+kn+kL=G)
-                    km = kidx_r[kn]
-                    Qmn = einsum('Pmn,PQ->Qmn',Lij[km][:,:,orbs].conj(),eps_inv_PQ-np.eye(naux))
-                    Wmn = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn,Lij[km][:,:,orbs])
-                    sigma[k][:norbs_occ] += -einsum('mn,mw->nw',Wmn[:,:norbs_occ],g0_occ[km])/np.pi
-                    sigma[k][norbs_occ:] += -einsum('mn,mw->nw',Wmn[:,norbs_occ:],g0_vir[km])/np.pi
-
-                    if gw.fc:
-                        # apply head correction
-                        assert (kn == km)
-                        sigma[k][:norbs_occ] += -Del_00 * g0_occ[kn][orbs][:norbs_occ] /np.pi
-                        sigma[k][norbs_occ:] += -Del_00 * g0_vir[kn][orbs][norbs_occ:] /np.pi
-
-                        # apply wing correction
-                        Wn_P0 = einsum('Pnm,P->nm',Lij[kn],eps_inv_P0).diagonal()
-                        Wn_P0 = Wn_P0.real * 2.
-                        Del_P0 = np.sqrt(gw.mol.vol/4./np.pi**3) * (6.*np.pi**2/gw.mol.vol/nkpts)**(2./3.) * Wn_P0[orbs]
-                        sigma[k][:norbs_occ] += -einsum('n,nw->nw', Del_P0[:norbs_occ],
-                                                        g0_occ[kn][orbs][:norbs_occ]) /np.pi
-                        sigma[k][norbs_occ:] += -einsum('n,nw->nw', Del_P0[norbs_occ:],
-                                                        g0_vir[kn][orbs][norbs_occ:]) /np.pi
-        else:
-            for w in range(nw):
-                Pi = get_rho_response(gw, freqs[w], mo_energy, Lij, kL, kidx)
-                Pi_inv = np.linalg.inv(np.eye(naux)-Pi)-np.eye(naux)
-                g0_occ = wts[w] * emo_occ / (emo_occ**2+freqs[w]**2)
-                g0_vir = wts[w] * emo_vir / (emo_vir**2+freqs[w]**2)
-                for k in range(nklist):
-                    kn = kptlist[k]
+                    # inverse dielectric matrix
+                    # equation 53 in 10.1021/acs.jctc.0c00704
+                    eps_inv_00 += 1.0 / nq_pts * 1.0 / (eps_00 - reduce(np.matmul, (eps_P0.conj(), Pi_inv, eps_P0)))
+                    # equation 54 in 10.1021/acs.jctc.0c00704
+                    eps_inv_P0 += 1.0 / nq_pts * (-eps_inv_00) * np.matmul(Pi_inv, eps_P0)
+
+                # head correction, equation 43 in 10.1021/acs.jctc.0c00704
+                Del_00 = 2.0 / np.pi * (6.0 * np.pi**2 / gw.mol.vol / nkpts) ** (1.0 / 3.0) * (eps_inv_00 - 1.0)
+
+            Pi_inv -= np.eye(naux)
+            g0 = wts[w] * emo / (emo**2 + freqs[w] ** 2)
+            for k in range(nklist):
+                kn = kptlist[k]
+                # Find km that conserves with kn and kL (-km+kn+kL=G)
+                km = kidx_r[kn]
+
+
+                if len(orbs) == nmo:
+                    l_slice = np.ascontiguousarray(Lij[km].reshape(naux, -1))
+                    if fullsigma:
+                        l_slice_mQn = np.ascontiguousarray(Lij_kmQn[km])
+                else:
+                    l_slice = np.ascontiguousarray(Lij[km, :, :, mkslice(orbs)].reshape(naux, -1))
+                    if fullsigma:
+                        l_slice_mQn = np.ascontiguousarray(Lij_kmQn[km, :, :, mkslice(orbs)])
+
+                # Qmn = einsum('Pmn,PQ->Qmn', Lij[km][:, :, orbs].conj(), Pi_inv)
+                scipy.linalg.blas.zgemm(alpha=1.0, a=Pi_inv.T, b=l_slice.T, c=mnQ.T, overwrite_c=1, trans_b=2)
+
+                if fullsigma is False:
+                    # Wmn = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn,Lij[km][:,:,orbs])
+                    Qmn[:] = mnQ.T * l_slice
+                    np.matmul(naux_ones, Qmn, out=Wmn.reshape(1, nmo * norbs))
+                    array_scale(Wmn, 1.0 / nkpts / np.pi)
+
+                    # sigma[k] += -einsum('mn,mw->nw',Wmn,g0[km]) / np.pi
+                    # 1 / np.pi is included in Wmn above
+                    sigma[k] -= np.matmul(Wmn.reshape(nmo, norbs).T, g0[km])
+                else:
+                    # for orbm in range(nmo):
+                    #     Wmn[orbm] = 1./nkpts * np.dot(Qmn[:,orbm,:].transpose(),Lij[km][:,orbm,orbs])
+                    #for m in range(nmo):
+                    #    np.matmul(Qmn[:, m, :].T, np.ascontiguousarray(Lij[km, :, m, mkslice(orbs)]), out=Wmn[m])
+                    np.matmul(mnQ.reshape(nmo, norbs, naux), l_slice_mQn, out=Wmn)
+                    array_scale(Wmn, 1.0 / nkpts / np.pi)
+
+                    #Wmn = Wmn.reshape(nmo, norbs * norbs).T
+                    # sigma[k] += -einsum('mnl,mw->nlw',Wmn,g0[km])/np.pi
+                    # 1 / np.pi is included in Wmn above
+                    sigma[k] -= np.matmul(Wmn.reshape(nmo, norbs * norbs).T, g0[km]).reshape(norbs, norbs, nw_sigma)
+
+                if gw.fc and kL == 0:
                     # Find km that conserves with kn and kL (-km+kn+kL=G)
-                    km = kidx_r[kn]
-                    Qmn = einsum('Pmn,PQ->Qmn',Lij[km][:,:,orbs].conj(),Pi_inv)
-                    Wmn = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn,Lij[km][:,:,orbs])
-                    sigma[k][:norbs_occ] += -einsum('mn,mw->nw',Wmn[:,:norbs_occ],g0_occ[km])/np.pi
-                    sigma[k][norbs_occ:] += -einsum('mn,mw->nw',Wmn[:,norbs_occ:],g0_vir[km])/np.pi
+                    assert kn == km
+                    if fullsigma is False:
+                        # head correction
+                        sigma[k] += -Del_00 * g0[kn][orbs] / np.pi
+
+                        # wing correction
+                        Wn_P0 = einsum('Pnn,P->n', Lij[kn], eps_inv_P0)
+                        Wn_P0 = Wn_P0[orbs].real * 2.0
+                        Del_P0 = np.sqrt(gw.mol.vol/4/np.pi**3) * (6*np.pi**2/gw.mol.vol/nkpts) ** (2/3) * Wn_P0
+                        sigma[k] += -einsum('n,nw->nw', Del_P0, g0[kn][orbs]) / np.pi
+                    else:
+                        # head correction
+                        tmp = -Del_00 * g0[kn][orbs] / np.pi
+                        sigma[k, np.arange(norbs), np.arange(norbs), :] += tmp
+
+                        # wing correction
+                        Wn_P0 = einsum('Pnn,P->n', Lij[kn], eps_inv_P0)
+                        Wn_P0 = Wn_P0[orbs].real * 2.0
+                        Del_P0 = np.sqrt(gw.mol.vol/4/np.pi**3) * (6*np.pi**2/gw.mol.vol/nkpts) ** (2/3) * Wn_P0
+                        tmp = -einsum('n,nw->nw', Del_P0, g0[kn][orbs]) / np.pi
+                        sigma[k, np.arange(norbs), np.arange(norbs), :] += tmp
+
+    if gw.rdm:
+        gw.sigmaI = sigma
 
     return sigma, omega
 
-def get_rho_response_head(gw, omega, mo_energy, qij):
-    '''
-    Compute head (G=0, G'=0) density response function in auxiliary basis at freq iw
-    '''
-    nkpts, nocc, nvir = qij.shape
+
+def get_sigma_outcore(
+    gw, freqs, wts, ef, mo_energy, orbs=None, kptlist=None, mo_coeff=None, mo_occ=None, iw_cutoff=None, fullsigma=False
+):
+    """Low-memory routine to get GW self-energy.
+    See equation 27 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        GW objects,
+        provides attributes: _scf, mol, frozen, nmo, nocc, kpts, nkpts, mo_coeff, mo_occ, fc, fc_grid, with_df
+    freqs : double array
+        position of imaginary frequency
+    wts : double array
+        weight of frequency points
+    ef : double
+        Fermi level
+    mo_energy : double ndarray
+        non-frozen orbital energy
+    orbs : list, optional
+        orbital index in non-frozen nmo to calculate self-energy, by default None
+    kptlist : list, optional
+        k-point index to calculate self-energy, by default None
+    mo_coeff : complex ndarray, optional
+        coefficient from AO to non-frozen MO, by default None
+    mo_occ : double ndarray, optional
+        non-frozen occupation number, by default None
+    iw_cutoff : complex, optional
+        imaginary grid cutoff for fitting, by default None
+    fullsigma : bool, optional
+        calculate off-diagonal elements, by default False
+
+    Returns
+    -------
+    sigma: complex ndarray
+        self-energy on the imaginary axis
+    omega: complex ndarray
+        imaginary frequency grids of self-energy
+    """
+    assert gw.fc is False, "finite-size correction is not implemented in get_sigma_outcore"
     nocc = gw.nocc
+    nmo = gw.nmo
+    nkpts = gw.nkpts
     kpts = gw.kpts
 
-    # Compute Pi head
-    Pi_00 = 0j
-    for i, kpti in enumerate(kpts):
-        eia = mo_energy[i,:nocc,None] - mo_energy[i,None,nocc:]
-        eia = eia/(omega**2+eia*eia)
-        Pi_00 += 4./nkpts * einsum('ia,ia->',eia,qij[i].conj()*qij[i])
-    return Pi_00
+    if orbs is None:
+        orbs = list(range(nmo))
+    if kptlist is None:
+        kptlist = list(range(nkpts))
+    norbs = len(orbs)
+    nklist = len(kptlist)
+    nw = len(freqs)
 
-def get_rho_response_wing(gw, omega, mo_energy, Lpq, qij):
-    '''
-    Compute wing (G=P, G'=0) density response function in auxiliary basis at freq iw
-    '''
-    nkpts, naux, nmo, nmo = Lpq.shape
-    nocc = gw.nocc
-    kpts = gw.kpts
+    if mo_coeff is None:
+        mo_coeff = _mo_frozen(gw, gw.mo_coeff)
+    if mo_occ is None:
+        mo_occ = _mo_occ_frozen(gw, gw.mo_occ)
+    nao = mo_coeff[0].shape[0]
 
-    # Compute Pi wing
-    Pi = np.zeros(naux,dtype=np.complex128)
-    for i, kpti in enumerate(kpts):
-        eia = mo_energy[i,:nocc,None] - mo_energy[i,None,nocc:]
-        eia = eia/(omega**2+eia*eia)
-        eia_q = eia * qij[i].conj()
-        Pi += 4./nkpts * einsum('Pia,ia->P',Lpq[i][:,:nocc,nocc:],eia_q)
-    return Pi
+    # possible kpts shift center
+    kscaled = gw.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
 
-def get_qij(gw, q, mo_coeff, uniform_grids=False):
-    '''
-    Compute qij = 1/Omega * |< psi_{ik} | e^{iqr} | psi_{ak-q} >|^2 at q: (nkpts, nocc, nvir)
-    through kp perturbation theory
-    Ref: Phys. Rev. B 83, 245122 (2011)
-    '''
-    nocc = gw.nocc
-    nmo = gw.nmo
-    nvir = nmo - nocc
-    kpts = gw.kpts
-    nkpts = len(kpts)
-    cell = gw.mol
-    mo_energy = gw._scf.mo_energy
+    # Integration on numerical grids
+    if iw_cutoff is not None and gw.rdm is False:
+        nw_sigma = sum(iw < iw_cutoff for iw in freqs) + 1
+    else:
+        nw_sigma = nw + 1
 
-    if uniform_grids:
-        mydf = df.FFTDF(cell, kpts=kpts)
-        coords = cell.gen_uniform_grids(mydf.mesh)
+    omega = np.zeros(shape=[nw_sigma], dtype=np.complex128)
+    omega[1:] = 1j * freqs[: (nw_sigma - 1)] + ef
+    emo = omega[None, None, :] - mo_energy[:, :, None]
+
+    if fullsigma is False:
+        sigma = np.zeros(shape=[nklist, norbs, nw_sigma], dtype=np.complex128)
     else:
-        coords, weights = dft.gen_grid.get_becke_grids(cell,level=5)
-    ngrid = len(coords)
+        sigma = np.zeros(shape=[nklist, norbs, norbs, nw_sigma], dtype=np.complex128)
 
-    qij = np.zeros((nkpts,nocc,nvir),dtype=np.complex128)
-    for i, kpti in enumerate(kpts):
-        ao_p = dft.numint.eval_ao(cell, coords, kpt=kpti, deriv=1)
-        ao = ao_p[0]
-        ao_grad = ao_p[1:4]
-        if uniform_grids:
-            ao_ao_grad = einsum('mg,xgn->xmn',ao.T.conj(),ao_grad) * cell.vol / ngrid
+    cput0 = (time.process_time(), time.perf_counter())
+    cderiarr = gw.with_df.cderi_array()
+    for kL in range(nkpts):
+        cput3 = (time.process_time(), time.perf_counter())
+        # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+        # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+        kidx = np.zeros(shape=[nkpts], dtype=np.int64)
+        kidx_r = np.zeros(shape=[nkpts], dtype=np.int64)
+
+        for i in range(nkpts):
+            for j in range(nkpts):
+                # Find (ki,kj) that satisfies momentum conservation with kL
+                kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+                is_kconserv = np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12
+                if is_kconserv:
+                    kidx[i] = j
+                    kidx_r[j] = i
+
+        # TODO: more efficient way to find naux without loading the whole array
+        Lpq_ao = cderiarr.load(kpts[0], kpts[kidx[0]])
+        assert len(Lpq_ao.shape) == 2
+        naux = Lpq_ao.shape[0]
+
+        Pi = np.zeros(shape=[nw, naux, naux], dtype=np.complex128)
+        cput1 = (time.process_time(), time.perf_counter())
+        for i in range(nkpts):
+            a = kidx[i]
+            logger.debug(gw, 'Pi (kL: %s / %s, ki: %s, kj: %s)' % (kL + 1, nkpts, a, kidx_r[a]))
+            Lpq_ao = cderiarr.load(kpts[i], kpts[a])
+            if Lpq_ao.shape[-1] == (nao * (nao + 1)) // 2:
+                Lpq_ao = lib.unpack_tril(Lpq_ao).reshape(-1, nao**2)
+            else:
+                Lpq_ao = Lpq_ao.reshape(-1, nao**2)
+            Lpq_ao = Lpq_ao.astype(np.complex128)
+
+            moij, ijslice = _conc_mos(mo_coeff[i], mo_coeff[a])[2:]
+            Lpq = None
+            Lpq = _ao2mo.r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lpq)
+            del Lpq_ao
+            Lpq = np.ascontiguousarray(Lpq.reshape(-1, nmo, nmo))
+
+            if hasattr(gw._scf, 'sigma'):
+                eia = mo_energy[i, :, None] - mo_energy[a, None, :]
+                fia = (mo_occ[i][:, None] - mo_occ[a][None, :]) / 2.0
+                Lia = Lpq
+                for w in range(nw):
+                    freqs_w = freqs[w]
+                    eia_w = eia * fia / (freqs_w**2 + eia**2)
+                    Pia = Lia * eia_w
+                    # Response from both spin-up and spin-down density
+                    # both ia and ai are included, this gives a factor of 2.0
+                    # Pi += (2./nkpts) * einsum('Pia,Qia->PQ', Pia, Lpq_i.conj())
+                    scipy.linalg.blas.zgemm(
+                        alpha=2.0 / nkpts,
+                        a=Lia.reshape(naux, nmo * nmo).T,
+                        b=Pia.reshape(naux, nmo * nmo).T,
+                        c=Pi[w].T,
+                        trans_a=2,
+                        trans_b=0,
+                        beta=1.0,
+                        overwrite_c=True,
+                    )
+                    del eia_w, Pia
+                del eia, fia
+            else:
+                eia = mo_energy[i, :nocc, None] - mo_energy[a, None, nocc:]
+                Lia = np.ascontiguousarray(Lpq[:, :nocc, nocc:])
+                nvir = Lia.shape[-1]
+                for w in range(nw):
+                    freqs_w = freqs[w]
+                    eia_w = eia / (freqs_w**2 + eia**2)
+                    Pia = Lia * eia_w
+                    # Response from both spin-up and spin-down density
+                    # Pi += (4./nkpts) * einsum('Pia,Qia->PQ', Pia, Lov.conj())
+                    scipy.linalg.blas.zgemm(
+                        alpha=4.0 / nkpts,
+                        a=Lia.reshape(naux, nocc * nvir).T,
+                        b=Pia.reshape(naux, nocc * nvir).T,
+                        c=Pi[w].T,
+                        trans_a=2,
+                        trans_b=0,
+                        beta=1.0,
+                        overwrite_c=True,
+                    )
+                    del eia_w, Pia
+                del eia
+            del Lpq, Lia
+
+        logger.timer(gw, 'Calculate Pi for kL: %s / %s' % (kL + 1, nkpts), *cput1)
+
+        for w in range(nw):
+            Pi[w] = np.linalg.inv(np.eye(naux) - Pi[w])
+            Pi[w] -= np.eye(naux)
+        Pi_inv = Pi
+
+        # allocate intermediates
+        naux_ones = np.ones(shape=[1, naux], dtype=np.complex128)
+        mnQ = np.zeros(shape=[nmo * norbs, naux], dtype=np.complex128)
+        if fullsigma is False:
+            Qmn = np.zeros(shape=[naux, nmo * norbs], dtype=np.complex128)
+            Wmn = np.zeros(shape=[nmo, norbs], dtype=np.complex128)
         else:
-            ao_ao_grad = einsum('g,mg,xgn->xmn',weights,ao.T.conj(),ao_grad)
-        q_ao_ao_grad = -1j * einsum('x,xmn->mn',q,ao_ao_grad)
-        q_mo_mo_grad = np.dot(np.dot(mo_coeff[i][:,:nocc].T.conj(), q_ao_ao_grad), mo_coeff[i][:,nocc:])
-        enm = 1./(mo_energy[i][nocc:,None] - mo_energy[i][None,:nocc])
-        dens = enm.T * q_mo_mo_grad
-        qij[i] = dens / np.sqrt(cell.vol)
+            Wmn = np.zeros(shape=[nmo, norbs, norbs], dtype=np.complex128)
+            #Lij_kmQn = np.ascontiguousarray(Lij.transpose(0, 2, 1, 3))
+
+        for kn in range(nklist):
+            # Find km that conserves with kn and kL (-km+kn+kL=G)
+            km = kidx_r[kn]
+
+            cput2 = (time.process_time(), time.perf_counter())
+            logger.debug(gw, 'sigma (kL: %s / %s, ki: %s, kj: %s)' % (kL + 1, nkpts, km, kn))
+            Lpq_ao = cderiarr.load(kpts[km], kpts[kn])
+            if Lpq_ao.shape[-1] == (nao * (nao + 1)) // 2:
+                Lpq_ao = lib.unpack_tril(Lpq_ao).reshape(-1,nao**2)
+            else:
+                Lpq_ao = Lpq_ao.reshape(-1,nao**2)
+            Lpq_ao = Lpq_ao.astype(np.complex128)
+
+            Lpq = None
+            moij, ijslice = _conc_mos(mo_coeff[km], mo_coeff[kn])[2:]
+            Lpq = _ao2mo.r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lpq)
+            Lpq = np.ascontiguousarray(Lpq.reshape(-1, nmo, nmo))
+
+            if len(orbs) == nmo:
+                l_slice = np.ascontiguousarray(Lpq.reshape(naux, -1))
+                if fullsigma:
+                    l_slice_mQn = np.ascontiguousarray(Lpq.transpose(1, 0, 2))
+            else:
+                l_slice = np.ascontiguousarray(Lpq[:, :, mkslice(orbs)].reshape(naux, -1))
+                if fullsigma:
+                    l_slice_mQn = np.ascontiguousarray(Lpq[:, :, mkslice(orbs)].transpose(1, 0, 2))
 
-    return qij
+            for w in range(nw):
+                g0 = wts[w] * emo[km] / (emo[km]**2 + freqs[w] ** 2)
+
+                # Qmn = einsum('Pmn,PQ->Qmn', Lij[km][:, :, orbs].conj(), Pi_inv)
+                scipy.linalg.blas.zgemm(alpha=1.0, a=Pi_inv[w].T, b=l_slice.T, c=mnQ.T, overwrite_c=1, trans_b=2)
+
+                if fullsigma is False:
+                    # Wmn = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn,Lij[km][:,:,orbs])
+                    Qmn[:] = mnQ.T * l_slice
+                    np.matmul(naux_ones, Qmn, out=Wmn.reshape(1, nmo * norbs))
+                    array_scale(Wmn, 1.0 / nkpts / np.pi)
+
+                    # sigma[kn] += -einsum('mn,mw->nw',Wmn,g0[km]) / np.pi
+                    # 1 / np.pi is included in Wmn above
+                    sigma[kn] -= np.matmul(Wmn.reshape(nmo, norbs).T, g0)
+                else:
+                    # for orbm in range(nmo):
+                    #     Wmn[orbm] = 1./nkpts * np.dot(Qmn[:,orbm,:].transpose(),Lij[km][:,orbm,orbs])
+                    #for m in range(nmo):
+                    #    np.matmul(Qmn[:, m, :].T, np.ascontiguousarray(Lij[km, :, m, mkslice(orbs)]), out=Wmn[m])
+                    np.matmul(mnQ.reshape(nmo, norbs, naux), l_slice_mQn, out=Wmn)
+                    array_scale(Wmn, 1.0 / nkpts / np.pi)
+
+                    #Wmn = Wmn.reshape(nmo, norbs * norbs).T
+                    # sigma[kn] += -einsum('mnl,mw->nlw',Wmn,g0[km])/np.pi
+                    # 1 / np.pi is included in Wmn above
+                    sigma[kn] -= np.matmul(Wmn.reshape(nmo, norbs * norbs).T, g0).reshape(norbs, norbs, nw_sigma)
+
+            del Lpq, l_slice
+            if fullsigma:
+                del l_slice_mQn
+            logger.timer(gw, 'GW correlation self-energy for kL: %s / %s kn: %d' % (kL + 1, nkpts, kn), *cput2)
+
+        del Pi, Pi_inv, mnQ, Wmn
+        if fullsigma is False:
+            del Qmn
+        logger.timer(gw, 'GW correlation self-energy for kL: %s / %s' % (kL + 1, nkpts), *cput3)
+
+    if gw.rdm:
+        gw.sigmaI = sigma
+
+    logger.timer(gw, 'GW correlation self-energy', *cput0)
+
+    return sigma, omega
+
+
+def get_sigma_exchange(gw, mo_coeff_full=None, mo_occ_full=None):
+    """Get exchange self-energy (EXX).
 
-def _get_scaled_legendre_roots(nw):
+    Parameters
+    ----------
+    gw : KRGWAC
+        gw object
+    mo_coeff : complex ndarray, optional
+        orbital coefficient, by default None
+    mo_occ : double ndarray, optional
+        occupation number, by default None
+
+    Returns
+    -------
+    vk : complex ndarray
+        exchange self-energy
     """
-    Scale nw Legendre roots, which lie in the
-    interval [-1, 1], so that they lie in [0, inf)
-    Ref: www.cond-mat.de/events/correl19/manuscripts/ren.pdf
+    nmo = gw.nmo
+    nkpts = gw.nkpts
+    kpts = gw.kpts
+
+    if mo_coeff_full is None:
+        mo_coeff_full = gw.mo_coeff
+    if mo_occ_full is None:
+        mo_occ_full = gw.mo_occ
+    nao = mo_coeff_full[0].shape[0]
+    nmo_full = nao
+    nocc_full = int(np.sum(gw._scf.mo_occ[0])) // 2
+
+    # possible kpts shift center
+    kscaled = gw.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    vk = np.zeros(shape=[nkpts, nmo_full, nmo_full], dtype=np.complex128)
+    cderiarr = gw.with_df.cderi_array()
+    for kL in range(nkpts):
+        # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+        # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+        kidx = np.zeros(shape=[nkpts], dtype=np.int64)
+        kidx_r = np.zeros(shape=[nkpts], dtype=np.int64)
+        for i in range(nkpts):
+            for j in range(nkpts):
+                # Find (ki,kj) that satisfies momentum conservation with kL
+                kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+                is_kconserv = np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12
+                if is_kconserv:
+                    kidx[i] = j
+                    kidx_r[j] = i
+
+        for kn in range(nkpts):
+            # kn is i
+            # Find km that conserves with kn and kL (-km+kn+kL=G)
+            km = kidx_r[kn] # km is j
 
-    Returns:
-        freqs : 1D ndarray
-        wts : 1D ndarray
+            # logger.debug(gw, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s @ Rank %d)' % (kL + 1, nkpts, i, j, rank))
+
+            # Read (L|pq) and ao2mo transform to (L|ij)
+            # support unequal naux on different k points
+            Lpq_ao = cderiarr.load(kpts[km], kpts[kn])
+            if Lpq_ao.shape[-1] == (nao * (nao + 1)) // 2:
+                Lpq_ao = lib.unpack_tril(Lpq_ao).reshape(-1, nao**2)
+            else:
+                Lpq_ao = Lpq_ao.reshape(-1, nao**2)
+            Lpq_ao = Lpq_ao.astype(np.complex128)
+
+            Lij = None
+            if hasattr(gw._scf, 'sigma'):
+                moij, ijslice = _conc_mos(mo_coeff_full[km], mo_coeff_full[kn])[2:]
+                Lij = _ao2mo.r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lij)
+                Lij = Lij.reshape(-1, nmo_full, nmo_full)
+            else:
+                moij, ijslice = _conc_mos(mo_coeff_full[km][:, :nocc_full], mo_coeff_full[kn])[2:]
+                Lij = _ao2mo.r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lij)
+                Lij = Lij.reshape(-1, nocc_full, nmo_full)
+
+            if hasattr(gw._scf, 'sigma'):
+                # vk[k] -= 1.0 / nkpts * einsum('i,Lip,Liq->pq', mo_occ[km], Lij[km].conj(), Lij[km]) * 0.5
+                Lij_occ = Lij * mo_occ_full[km][None, :, None]
+                scipy.linalg.blas.zgemm(
+                    alpha=-0.5 / nkpts,
+                    a=Lij_occ.reshape(-1, nmo_full).T,
+                    b=Lij.reshape(-1, nmo_full).T,
+                    c=vk[kn].T,
+                    trans_a=0,
+                    trans_b=2,
+                    beta=1.0,
+                    overwrite_c=True,
+                )
+            else:
+                # vk[k] -= 1.0 / nkpts * einsum('Lip,Liq->pq', Lij[km].conj(), Lij[km])
+                scipy.linalg.blas.zgemm(
+                    alpha=-1.0 / nkpts,
+                    a=Lij.reshape(-1, nmo_full).T,
+                    b=Lij.reshape(-1, nmo_full).T,
+                    c=vk[kn].T,
+                    trans_a=0,
+                    trans_b=2,
+                    beta=1.0,
+                    overwrite_c=True,
+                )
+
+    if nmo != nmo_full:
+        frozen_mask = get_frozen_mask(gw)
+        identity = np.eye(nmo_full, dtype=np.complex128)
+        vk_frz = np.zeros(shape=[nkpts, nmo, nmo], dtype=np.complex128)
+        for k in range(nkpts):
+            vk_frz[k] = identity[frozen_mask[k], :] @ vk[k] @ identity[:, frozen_mask[k]]
+        vk = vk_frz
+
+    return vk
+
+
+def get_ef(kmf, mo_energy):
+    """Get Fermi level.
+    For gapped systems, Fermi level is computed as the average between HOMO and LUMO.
+    For metallic systems, Fermi level is optmized according to mo_energy.
+
+    Parameters
+    ----------
+    kmf : pyscf.pbc.scf.rhf.RHF/pyscf.pbc.dft.rks.RKS
+        mean-field object, provides attributes: kpts, sigma, smearing_method
+    mo_energy : double array
+        orbital energy
+
+    Returns
+    -------
+    ef : double
+        Fermi level
     """
-    freqs, wts = np.polynomial.legendre.leggauss(nw)
-    x0 = 0.5
-    freqs_new = x0*(1.+freqs)/(1.-freqs)
-    wts = wts*2.*x0/(1.-freqs)**2
-    return freqs_new, wts
+    if hasattr(kmf, "sigma"):
+        from pyscf.scf import addons as mol_addons
 
-def _get_clenshaw_curtis_roots(nw):
+        if kmf.smearing_method.lower() == "fermi":
+            f_occ = mol_addons._fermi_smearing_occ
+        else:
+            f_occ = mol_addons._gaussian_smearing_occ
+        mo_energy_stack = np.hstack(np.asarray(mo_energy))
+        nelectron = kmf.mol.tot_electrons(len(kmf.kpts))
+        ef = mol_addons._smearing_optimize(f_occ, mo_energy_stack, (nelectron + 1) // 2, kmf.sigma)[0]
+    else:
+        nocc = int(kmf.cell.nelectron // 2)
+        homo = -99.0
+        lumo = 99.0
+        for k in range(len(kmf.kpts)):
+            if homo < mo_energy[k][nocc - 1]:
+                homo = mo_energy[k][nocc - 1]
+            if lumo > mo_energy[k][nocc]:
+                lumo = mo_energy[k][nocc]
+        ef = (homo + lumo) / 2.0
+    return ef
+
+
+def get_g0_k(omega, mo_energy, eta):
+    """Get non-interacting Green's function.
+
+    Parameters
+    ----------
+    omega : double or complex ndarray
+        frequency grids
+    mo_energy : double ndarray
+        orbital energy
+    eta : double
+        broadening parameter
+
+    Returns
+    -------
+    gf0 : complex ndarray
+        non-interacting Green's function
     """
-    Clenshaw-Curtis quadrature on [0,inf)
-    Ref: J. Chem. Phys. 132, 234114 (2010)
-    Returns:
-        freqs : 1D ndarray
-        wts : 1D ndarray
+    nkpts = len(mo_energy)
+    nmo = len(mo_energy[0])
+    nw = len(omega)
+    gf0 = np.zeros(shape=[nkpts, nmo, nmo, nw], dtype=np.complex128)
+    for k in range(nkpts):
+        for iw in range(nw):
+            gf0[k, :, :, iw] = np.diag(1.0 / (omega[iw] + 1j * eta - mo_energy[k]))
+    return gf0
+
+
+def make_gf(gw, omega, eta):
+    """Get dynamical Green's function and self-energy.
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        GW object, provides attributes: orbs, kptlist, ef, ac_coeff, omega_fit, vk, vxc, _scf.mo_energy
+    omega : double or complex array
+        frequency grids
+    eta : double
+        broadening parameter
+
+    Returns
+    -------
+    gf : complex ndarray
+        GW Green's function
+    gf0 : complex ndarray
+        mean-field Green's function
+    sigma : complex ndarray
+        GW correlation self-energy
     """
-    freqs = np.zeros(nw)
-    wts = np.zeros(nw)
-    a = 0.2
-    for w in range(nw):
-        t = (w+1.0)/nw * np.pi/2.
-        freqs[w] = a / np.tan(t)
-        if w != nw-1:
-            wts[w] = a*np.pi/2./nw/(np.sin(t)**2)
-        else:
-            wts[w] = a*np.pi/4./nw/(np.sin(t)**2)
-    return freqs[::-1], wts[::-1]
+    assert gw.frozen is None or gw.frozen == 0
+
+    if eta is None:
+        eta = gw.eta
+
+    nomega = len(omega)
+    sigma = np.zeros(shape=[gw.nkpts, gw.nmo, gw.nmo, nomega], dtype=np.complex128)
+    if gw.fullsigma:
+        for ik, k in enumerate(gw.kptlist):
+            for ip, p in enumerate(gw.orbs_frz):
+                for iq, q in enumerate(gw.orbs_frz):
+                    sigma[k, p, q] = gw.acobj[ik, ip, iq].ac_eval(omega + 1j * eta) + gw.vk[k, p, q] - gw.vxc[k, p, q]
+    else:
+        for ik, k in enumerate(gw.kptlist):
+            for ip, p in enumerate(gw.orbs_frz):
+                sigma[k, p, p] = gw.acobj[ik, ip].ac_eval(omega + 1j * eta) + gw.vk[k, p, p] - gw.vxc[k, p, p]
+
+    gf0 = get_g0_k(omega, gw._scf.mo_energy, eta)
+    gf = np.zeros_like(gf0)
+    for k in range(gw.nkpts):
+        for iw in range(nomega):
+            gf[k, :, :, iw] = np.linalg.inv(np.linalg.inv(gf0[k, :, :, iw]) - sigma[k, :, :, iw])
+
+    return gf, gf0, sigma
+
+
+def make_rdm1_linear(gw, ao_repr=False):
+    """Get GW density matrix from Green's function G(it=0).
+    G is from linear Dyson equation, which conserves particle number
+    G = G0 + G0 Sigma G0
+    See equation 16 in 10.1021/acs.jctc.0c01264
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        GW object, provides attributes: sigmaI, mol, _scf, freqs, wts, frozen, orbs, fc
+    ao_repr : bool, optional
+        return density matrix in AO, by default False
+
+    Returns
+    -------
+    rdm1 : double ndarray
+        density matrix
+    """
+    assert gw.sigmaI is not None
+    assert gw.rdm is True and gw.fullsigma is True
+    assert gw.frozen is None or gw.frozen == 0
+    sigmaI = gw.sigmaI[:, :, :, 1:]
+    freqs = 1j * gw.freqs
+    wts = gw.wts
+    nmo = gw.nmo
+    nkpts = gw.nkpts
+    if len(gw.orbs) != nmo:
+        sigma = np.zeros(shape=[nkpts, nmo, nmo, len(freqs)], dtype=sigmaI.dtype)
+        for k in range(nkpts):
+            for ia, a in enumerate(gw.orbs):
+                for ib, b in enumerate(gw.orbs):
+                    sigma[k, a, b, :] = sigmaI[k, ia, ib, :]
+    else:
+        sigma = sigmaI
 
-def two_pole_fit(coeff, omega, sigma):
-    cf = coeff[:5] + 1j*coeff[5:]
-    f = cf[0] + cf[1]/(omega+cf[3]) + cf[2]/(omega+cf[4]) - sigma
-    f[0] = f[0]/0.01
-    return np.array([f.real,f.imag]).reshape(-1)
+    for iw in range(len(freqs)):
+        sigma[:, :, :, iw] += gw.vk - gw.vxc
+    gf0 = get_g0_k(freqs, np.array(gw._scf.mo_energy) - gw.ef, eta=0)
+    gf = np.array(gf0, copy=True)
+    for k in range(nkpts):
+        for iw in range(len(freqs)):
+            gf[k, :, :, iw] = reduce(np.matmul, (gf0[k, :, :, iw], sigma[k, :, :, iw], gf0[k, :, :, iw]))
 
-def two_pole(freqs, coeff):
-    cf = coeff[:5] + 1j*coeff[5:]
-    return cf[0] + cf[1]/(freqs+cf[3]) + cf[2]/(freqs+cf[4])
+    # GW density matrix
+    rdm1 = np.zeros(shape=[nkpts, nmo, nmo], dtype=np.double)
+    for k in range(nkpts):
+        rdm1[k] = 2.0 / np.pi * einsum('ijw,w->ij', gf[k], wts).real + np.eye(nmo)
+        logger.info(gw, 'GW particle number @ k%d = %s', k, np.trace(rdm1[k]))
 
-def AC_twopole_diag(sigma, omega, orbs, nocc):
-    """
-    Analytic continuation to real axis using a two-pole model
-    Returns:
-        coeff: 2D array (ncoeff, norbs)
+    # Symmetrize density matrix
+    for k in range(nkpts):
+        rdm1[k] = 0.5 * (rdm1[k] + rdm1[k].T)
+
+    if ao_repr is True:
+        ovlp = gw._scf.get_ovlp()
+        for k in range(nkpts):
+            CS = np.matmul(ovlp, gw._scf.mo_coeff[k])
+            rdm1[k] = reduce(np.matmul, (CS, rdm1[k], CS.conj().T))
+
+    return rdm1
+
+
+def _mo_energy_frozen(gw, mo_energy):
+    """Get non-frozen orbital energy.
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        GW object, provides attributes: frozen, nmo, nkpt
+    mo_energy : double ndarray
+        full orbital energy
+
+    Returns
+    -------
+    mo_energy_frozen : double ndarray
+        non-frozen orbital energy
     """
-    norbs, nw = sigma.shape
-    coeff = np.zeros((10,norbs))
-    for p in range(norbs):
-        if orbs[p] < nocc:
-            x0 = np.array([0, 1, 1, 1, -1, 0, 0, 0, -1.0, -0.5])
-        else:
-            x0 = np.array([0, 1, 1, 1, -1, 0, 0, 0, 1.0, 0.5])
-        #TODO: analytic gradient
-        xopt = least_squares(two_pole_fit, x0, jac='3-point', method='trf', xtol=1e-10,
-                             gtol = 1e-10, max_nfev=1000, verbose=0, args=(omega[p], sigma[p]))
-        if xopt.success is False:
-            print('WARN: 2P-Fit Orb %d not converged, cost function %e'%(p,xopt.cost))
-        coeff[:,p] = xopt.x.copy()
-    return coeff
-
-def thiele(fn,zn):
-    nfit = len(zn)
-    g = np.zeros((nfit,nfit),dtype=np.complex128)
-    g[:,0] = fn.copy()
-    for i in range(1,nfit):
-        g[i:,i] = (g[i-1,i-1]-g[i:,i-1])/((zn[i:]-zn[i-1])*g[i:,i-1])
-    a = g.diagonal()
-    return a
-
-def pade_thiele(freqs,zn,coeff):
-    nfit = len(coeff)
-    X = coeff[-1]*(freqs-zn[-2])
-    for i in range(nfit-1):
-        idx = nfit-i-1
-        X = coeff[idx]*(freqs-zn[idx-1])/(1.+X)
-    X = coeff[0]/(1.+X)
-    return X
-
-def AC_pade_thiele_diag(sigma, omega):
+    frozen_mask = get_frozen_mask(gw)
+    nmo = gw.nmo
+    nkpts = gw.nkpts
+    mo_energy_frozen = np.zeros(shape=[nkpts, nmo], dtype=np.double)
+    for k in range(nkpts):
+        mo_energy_frozen[k] = mo_energy[k][frozen_mask[k]]
+    return mo_energy_frozen
+
+
+def _mo_frozen(gw, mo):
+    """Get non-frozen orbital coefficient.
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        GW object, provides attributes: frozen, nmo, nkpt
+    mo : complex ndarray
+        full orbital coefficient
+
+    Returns
+    -------
+    mo_frozen : complex ndarray
+        non-frozen orbital coefficient
     """
-    Analytic continuation to real axis using a Pade approximation
-    from Thiele's reciprocal difference method
-    Reference: J. Low Temp. Phys. 29, 179 (1977)
-    Returns:
-        coeff: 2D array (ncoeff, norbs)
-        omega: 2D array (norbs, npade)
+    frozen_mask = get_frozen_mask(gw)
+    nmo = gw.nmo
+    nkpts = gw.nkpts
+    nao = mo[0].shape[0]
+    mo_frozen = np.zeros(shape=[nkpts, nao, nmo], dtype=np.complex128)
+    for k in range(nkpts):
+        mo_frozen[k] = mo[k][:, frozen_mask[k]]
+    return mo_frozen
+
+
+def _mo_occ_frozen(gw, mo_occ):
+    """Get non-frozen occupation number.
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        GW object, provides attributes: frozen, nmo, nkpt
+    mo_occ : double ndarray
+        full occupation number
+
+    Returns
+    -------
+    mo_occ_frozen : double ndarray
+        non-frozen occupation number
     """
-    idx = range(1,40,6)
-    sigma1 = sigma[:,idx].copy()
-    sigma2 = sigma[:,(idx[-1]+4)::4].copy()
-    sigma = np.hstack((sigma1,sigma2))
-    omega1 = omega[:,idx].copy()
-    omega2 = omega[:,(idx[-1]+4)::4].copy()
-    omega = np.hstack((omega1,omega2))
-    norbs, nw = sigma.shape
-    npade = nw // 2
-    coeff = np.zeros((npade*2,norbs),dtype=np.complex128)
-    for p in range(norbs):
-        coeff[:,p] = thiele(sigma[p,:npade*2], omega[p,:npade*2])
-
-    return coeff, omega[:,:npade*2]
+    frozen_mask = get_frozen_mask(gw)
+    nmo = gw.nmo
+    nkpts = gw.nkpts
+    mo_occ_frozen = np.zeros(shape=[nkpts, nmo], dtype=np.double)
+    for k in range(nkpts):
+        mo_occ_frozen[k] = mo_occ[k][frozen_mask[k]]
+    return mo_occ_frozen
 
-class KRGWAC(lib.StreamObject):
 
-    linearized = getattr(__config__, 'gw_gw_GW_linearized', False)
-    # Analytic continuation: pade or twopole
-    ac = getattr(__config__, 'gw_gw_GW_ac', 'pade')
-    # Whether applying finite size corrections
-    fc = getattr(__config__, 'gw_gw_GW_fc', True)
+def set_frozen_orbs(gw):
+    """Set .frozen attribute from frozen mask.
+
+    Parameters
+    ----------
+    gw : KRGWAC
+        unrestricted GW object
+    """
+    if gw.frozen is not None:
+        if gw.orbs is not None:
+            if isinstance(gw.frozen, (int, np.int64)):
+                # frozen core
+                gw.orbs_frz = [x - gw.frozen for x in gw.orbs]
+            else:
+                # frozen list
+                assert isinstance(gw.frozen[0], (int, np.int64))
+                gw.orbs_frz = []
+                for orbi in gw.orbs:
+                    count = len([p for p in gw.frozen if p <= orbi])
+                    gw.orbs_frz.append(orbi - count)
+            if any(np.array(gw.orbs_frz) < 0):
+                raise RuntimeError('GW orbs must be larger than frozen core!')
+        else:
+            gw.orbs_frz = range(gw.nmo)
+            gw.orbs = range(len(gw._scf.mo_energy[0]))
+            if isinstance(gw.frozen, (int, np.int64)):
+                gw.orbs = list(set(gw.orbs) - set(range(gw.frozen)))
+            else:
+                assert isinstance(gw.frozen[0], (int, np.int64))
+                gw.orbs = list(set(gw.orbs) - set(gw.frozen))
+    else:
+        if gw.orbs is None:
+            gw.orbs = range(len(gw._scf.mo_energy[0]))
+        gw.orbs_frz = gw.orbs
+    return
 
-    _keys = {
-        'linearized', 'ac', 'fc', 'frozen', 'mol', 'with_df',
-        'kpts', 'nkpts', 'mo_energy', 'mo_coeff', 'mo_occ', 'sigma',
-    }
 
+class KRGWAC(lib.StreamObject):
     def __init__(self, mf, frozen=None):
-        self.mol = mf.mol
-        self._scf = mf
-        self.verbose = self.mol.verbose
-        self.stdout = self.mol.stdout
-        self.max_memory = mf.max_memory
-
-        #TODO: implement frozen orbs
-        if frozen is not None and frozen > 0:
-            raise NotImplementedError
-        self.frozen = frozen
+        self.mol = mf.mol  # mol object
+        self._scf = mf  # mean-field object
+        self.verbose = self.mol.verbose  # verbose level
+        self.stdout = self.mol.stdout  # standard output
+        self.max_memory = mf.max_memory  # max memory in MB
+
+        # options
+        self.frozen = frozen  # frozen orbital option
+        self.orbs = None  # list of orbital index in full nmo
+        self.orbs_frz = None  # list of orbital index in non-frozen nmo
+        self.kptlist = None  # list of k-points to evaluate
+        self.fullsigma = False  # calculate off-diagonal self-energy
+        self.rdm = False  # calculate GW density matrix
+        self.vhf_df = False  # use density-fitting for exchange self-energy
+        self.fc = True  # finite-size correction to self-energy
+        self.fc_grid = False  # grids for finite-size correction to self-energy
+        self.outcore = False  # low-memory routine to calculate self-energy
+        self.eta = 5.0e-3  # broadening parameter
+        self.nw = 100  # number of grids for integration
+        self.ac = 'pade'  # analytical continuation method
+        self.ac_iw_cutoff = 5.0  # imaginary frequency cutting for fitting self-energy
+        self.ac_pade_npts = 18  # number of selected points for Pade approximation
+        self.ac_pade_step_ratio = 2.0 / 3.0  # final/initial step size for Pade approximation
+        self.qpe_max_iter = 100  # max iteration in iteratively solving quasiparticle equation
+        self.qpe_tol = 1.0e-6  # tolerance in Newton method for iteratively quasiparticle equation
+        self.qpe_linearized = False  # use linearized quasiparticle equation
+        self.qpe_linearized_range = [0.5, 1.5]  # Z-shot factor range, if not in this range, z=1
+        self.writefile = 0  # write file level
 
         # DF-KGW must use GDF integrals
         if getattr(mf, 'with_df', None):
             self.with_df = mf.with_df
         else:
             raise NotImplementedError
-
-##################################################
-# don't modify the following attributes, they are not input options
-        self._nocc = None
-        self._nmo = None
-        self.kpts = mf.kpts
-        self.nkpts = len(self.kpts)
-        # self.mo_energy: GW quasiparticle energy, not scf mo_energy
-        self.mo_energy = None
-        self.mo_coeff = mf.mo_coeff
-        self.mo_occ = mf.mo_occ
-        self.sigma = None
-
-    def dump_flags(self):
+        self._keys.update(['with_df'])
+
+        ##################################################
+        # don't modify the following attributes, they are not input options
+        self._nocc = None  # number of NON-FROZEN occupied orbitals
+        self._nmo = None  # number of NON-FROZEN orbitals
+        self.kpts = mf.kpts  # k-point list
+        self.nkpts = len(self.kpts)  # number of k-points
+        self.mo_energy = None  # orbital energy
+        self.mo_coeff = None  # orbital coefficient
+        self.mo_occ = None  # occupiation number
+
+        # results
+        self.vk = None  # exchange matrix in MO
+        self.vxc = None  # mean-field exchange-correlation matrix in MO
+        self.freqs = None  # frequency grids
+        self.wts = None  # weights of frequency grids
+        self.ef = None  # Fermi level
+        self.acobj = None  # analytical continuation object
+        self.ac_coeff = None  # Pade fitting coefficient, old interface, to be deprecated
+        self.omega_fit = None  # AC fitting frequency, old interface, to be deprecated
+        self.sigmaI = None  # self-energy in the imaginary axis
+
+        return
+
+    def dump_flags(self, verbose=None):
         log = logger.Logger(self.stdout, self.verbose)
         log.info('')
         log.info('******** %s ********', self.__class__)
@@ -582,59 +1366,98 @@ def dump_flags(self):
         nkpts = self.nkpts
         log.info('GW nocc = %d, nvir = %d, nkpts = %d', nocc, nvir, nkpts)
         if self.frozen is not None:
-            log.info('frozen orbitals %s', str(self.frozen))
-        logger.info(self, 'use perturbative linearized QP eqn = %s', self.linearized)
-        logger.info(self, 'analytic continuation method = %s', self.ac)
-        logger.info(self, 'GW finite size corrections = %s', self.fc)
-        return self
+            log.info('frozen orbitals = %s', str(self.frozen))
+        if self.kptlist is not None:
+            log.info('k-point list = %s', str(self.kptlist))
+        if self.orbs is not None:
+            log.info('orbital list = %s', str(self.orbs))
+        log.info('off-diagonal self-energy = %s', self.fullsigma)
+        log.info('GW density matrix = %s', self.rdm)
+        log.info('density-fitting for exchange = %s', self.vhf_df)
+        log.info('outcore for self-energy= %s', self.outcore)
+        log.info('finite size corrections = %s', self.fc)
+        if self.fc_grid is not None:
+            log.info('grids for finite size corrections = %s', self.fc_grid)
+        log.info('broadening parameter = %.3e', self.eta)
+        log.info('number of grids = %d', self.nw)
+        log.info('analytic continuation method = %s', self.ac)
+        log.info('imaginary frequency cutoff = %s', str(self.ac_iw_cutoff))
+        if self.ac == 'pade':
+            log.info('Pade points = %d', self.ac_pade_npts)
+            log.info('Pade step ratio = %.3f', self.ac_pade_step_ratio)
+        log.info('use perturbative linearized QP eqn = %s', self.qpe_linearized)
+        if self.qpe_linearized is True:
+            log.info('linearized factor range = %s', self.qpe_linearized_range)
+        else:
+            log.info('QPE max iter = %d', self.qpe_max_iter)
+            log.info('QPE tolerance = %.1e', self.qpe_tol)
+        log.info('')
+        return
 
     @property
     def nocc(self):
-        return self.get_nocc()
+        frozen_mask = get_frozen_mask(self)
+        nkpts = len(self._scf.mo_energy)
+        nelec = 0.0
+        for k in range(nkpts):
+            nelec += np.sum(self._scf.mo_occ[k][frozen_mask[k]])
+        nelec = int(nelec / nkpts)
+        return nelec // 2
+
     @nocc.setter
     def nocc(self, n):
         self._nocc = n
 
     @property
     def nmo(self):
-        return self.get_nmo()
+        frozen_mask = get_frozen_mask(self)
+        return len(self._scf.mo_energy[0][frozen_mask[0]])
+
     @nmo.setter
     def nmo(self, n):
         self._nmo = n
 
-    get_nocc = get_nocc
-    get_nmo = get_nmo
-    get_frozen_mask = get_frozen_mask
+    def kernel(self, orbs=None, kptlist=None):
+        """Run a G0W0 calculation.
 
-    def kernel(self, mo_energy=None, mo_coeff=None, orbs=None, kptlist=None, nw=100):
+        Parameters
+        ----------
+        orbs : list, optional
+            orbital list to calculate self-energy, by default None
+        kptlist : list, optional
+            k-point list to calculate self-energy, by default None
         """
-        Input:
-            kptlist: self-energy k-points
-            orbs: self-energy orbs
-            nw: grid number
-        Output:
-            mo_energy: GW quasiparticle energy
-        """
-        if mo_coeff is None:
-            mo_coeff = np.array(self._scf.mo_coeff)
-        if mo_energy is None:
-            mo_energy = np.array(self._scf.mo_energy)
+        if self.mo_energy is None:
+            self.mo_energy = np.array(self._scf.mo_energy, copy=True)
+        if self.mo_coeff is None:
+            self.mo_coeff = np.array(self._scf.mo_coeff, copy=True)
+        if self.mo_occ is None:
+            self.mo_occ = np.array(self._scf.mo_occ, copy=True)
+
+        self.orbs = orbs
+        self.kptlist = kptlist
+
+        if hasattr(self._scf, "sigma"):
+            self.nw = max(400, self.nw)
+            self.ac_pade_npts = 18
+            self.ac_pade_step_ratio = 5.0 / 6.0
+            self.fc = False
 
         nmo = self.nmo
         naux = self.with_df.get_naoaux()
         nkpts = self.nkpts
-        mem_incore = (2*nkpts*nmo**2*naux) * 16/1e6
+        mem_incore = (2 * nkpts * nmo**2 * naux) * 16 / 1e6
         mem_now = lib.current_memory()[0]
-        if (mem_incore + mem_now > 0.99*self.max_memory):
+        if mem_incore + mem_now > 0.99 * self.max_memory:
             logger.warn(self, 'Memory may not be enough!')
-            raise NotImplementedError
 
-        cput0 = (logger.process_clock(), logger.perf_counter())
+        cput0 = (time.process_time(), time.perf_counter())
         self.dump_flags()
-        self.converged, self.mo_energy, self.mo_coeff = \
-                kernel(self, mo_energy, mo_coeff, orbs=orbs,
-                       kptlist=kptlist, nw=nw, verbose=self.verbose)
-
-        logger.warn(self, 'GW QP energies may not be sorted from min to max')
-        logger.timer(self, 'GW', *cput0)
-        return self.mo_energy
+        kernel(self)
+        logger.timer(self, 'KRGWAC', *cput0)
+        return
+
+    set_frozen_orbs = set_frozen_orbs
+    make_rdm1 = make_rdm1_linear
+    make_gf = make_gf
+    get_sigma_exchange = get_sigma_exchange
diff --git a/pyscf/pbc/gw/krpa.py b/pyscf/pbc/gw/krpa.py
new file mode 100644
index 0000000000..6add9be745
--- /dev/null
+++ b/pyscf/pbc/gw/krpa.py
@@ -0,0 +1,1045 @@
+#!/usr/bin/env python
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Tianyu Zhu <zhutianyu1991@gmail.com>
+# Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
+# Author: Chaoqun Zhang <cq_zhang@outlook.com>
+# Author: Jincheng Yu <pimetamon@gmail.com>
+# Author: Jiachen Li <lijiachen.duke@gmail.com>
+#
+
+"""
+Periodic spin-restricted random phase approximation (direct RPA) with N^4 scaling.
+
+References:
+    T. Zhu and G.K.-L. Chan, J. Chem. Theory. Comput. 17, 727-741 (2021)
+    New J. Phys. 14, 053020 (2012)
+"""
+
+import numpy as np
+import scipy.linalg.blas as blas
+import time
+
+from pyscf import lib
+from pyscf.lib import logger, temporary_env
+from pyscf.ao2mo._ao2mo import r_e2
+from pyscf.ao2mo.incore import _conc_mos
+from pyscf.pbc import scf, tools
+from pyscf.pbc.mp.kmp2 import get_nocc, get_nmo, get_frozen_mask
+
+from pyscf.gw.utils.ac_grid import _get_scaled_legendre_roots
+from pyscf.pbc.gw.krgw_ac import get_rho_response, get_rho_response_head, get_rho_response_wing, get_qij, \
+    _mo_occ_frozen, _mo_energy_frozen, _mo_frozen
+
+
+einsum = lib.einsum
+
+
+def kernel(rpa, mo_energy, mo_coeff, nw=None, with_e_hf=None):
+    """RPA correlation and total energy
+
+    Parameters
+    ----------
+    rpa : KRPA
+        rpa object
+    mo_energy : double array
+        molecular orbital energies
+    mo_coeff : double ndarray
+        molecular orbital coefficients
+    Lpq : double array, optional
+        density fitting 3-center integral in MO basis, by default None
+    nw : int, optional
+        number of frequency point on imaginary axis, by default None
+    with_e_hf : float, optional
+        extra input HF energy, by default None
+
+    Returns
+    -------
+    e_tot : float
+        RPA total energy
+    e_hf : float
+        HF energy (exact exchange for given mo_coeff)
+    e_corr : float
+        RPA correlation energy
+    """
+    # Compute HF exchange energy (EXX)
+    if with_e_hf is None:
+        rhf = scf.KRHF(rpa.mol, rpa.kpts, exxdiv=rpa._scf.exxdiv)
+        rhf.verbose = 0
+        if hasattr(rpa._scf, 'sigma'):
+            rhf = scf.addons.smearing_(rhf, sigma=rpa._scf.sigma, method=rpa._scf.smearing_method)
+        rhf.with_df = rpa._scf.with_df
+        with temporary_env(rpa.with_df, verbose=0), temporary_env(rhf.mol, verbose=0):
+            dm = rpa._scf.make_rdm1()
+            e_1e = 1.0 / len(rpa.kpts) * lib.einsum('kij,kji', dm, rhf.get_hcore()).real
+            e_j = 0.5 / len(rpa.kpts) * lib.einsum('kij,kji', dm, rhf.get_j(rhf.cell, dm)).real
+            e_x = get_rpa_exx(rpa, acfd=rpa.acfd_exx, correction_only=False)
+            e_nuc = rpa._scf.energy_nuc()
+            e_hf = e_1e + e_j + e_x + e_nuc
+    else:
+        e_hf = with_e_hf
+        logger.debug(rpa, f'  Setting EXX energy explicitly to {e_hf}')
+
+    is_metal = hasattr(rpa._scf, 'sigma')
+
+    # Turn off FC for metals
+    if is_metal and rpa.fc:
+        logger.warn(rpa, 'FC not available for metals - setting rpa.fc to False')
+        rpa.fc = False
+
+    # Grids for integration on imaginary axis
+    freqs, wts = rpa.get_grids(nw=nw, mo_energy=mo_energy)
+
+    # Compute RPA correlation energy
+    if rpa.outcore:
+        if is_metal:
+            e_corr = get_rpa_ecorr_outcore_metal(rpa, freqs, wts)
+        else:
+            e_corr = get_rpa_ecorr_outcore(rpa, freqs, wts)
+    else:
+        e_corr = get_rpa_ecorr(rpa, freqs, wts)
+
+    # Compute total energy
+    e_tot = e_hf + e_corr
+
+    logger.debug(rpa, f'  RPA total energy = {e_tot}')
+    logger.debug(rpa, f'  EXX energy = {e_hf}, RPA corr energy = {e_corr}')
+
+    return e_tot, e_hf, e_corr
+
+
+def get_idx_metal(mo_occ, threshold=1.0e-6):
+    """Get index of occupied/virtual/fractional orbitals of metals.
+
+    Parameters
+    ----------
+    mo_occ : double 1d array
+        occupation number
+    threshold : double, optional
+        threshold to determine fractionally occupied orbitals, by default 1.0e-6
+
+    Returns
+    -------
+    idx_occ : list
+        list of occupied orbital indexes
+    idx_frac : list
+        list of fractionally occupied orbital indexes
+    idx_vir : list
+        list of virtual orbital indexes
+    """
+    idx_occ = np.where(mo_occ > 2.0 - threshold)[0]
+    idx_vir = np.where(mo_occ < threshold)[0]
+    idx_frac = list(range(idx_occ[-1] + 1, idx_vir[0]))
+
+    return idx_occ, idx_frac, idx_vir
+
+
+def get_rho_response_metal(omega, mo_energy, mo_occ, Lpq, kidx):
+    """Get Pi=PV for metallic systems.
+    P is density-density response function.
+    V is two-electron integral.
+    See equation 24 in doi.org/10.1021/acs.jctc.0c00704.
+
+    NOTE: this function is different from the one in krgw_ac.py.
+    They should be merged in the future. The metal version here
+    is more efficient both in memory and computational time.
+
+    Parameters
+    ----------
+    omega : double
+        real position of imaginary frequency
+    mo_energy : double ndarray
+        orbital energy
+    mo_occ : double ndarray
+        occupation number
+    Lpq : list of complex ndarray
+        three-center density-fitting matrix in MO.
+        Lpq[ki] contains the naux x (nocc_i + nfrac_i) x (nfrac_i + nvir_i) sub-block.
+    kidx : list
+        momentum-conserved k-point list kj=kidx[ki]
+
+    Returns
+    -------
+    Pi : complex ndarray
+        Pi in auxiliary basis at freq iw
+    """
+    nkpts = len(Lpq)
+    naux = Lpq[0].shape[0]
+
+    # Compute Pi for kL
+    Pi = np.zeros(shape=[naux, naux], dtype=np.complex128)
+    for i in range(nkpts):
+        # Find ka that conserves with ki and kL (-ki+ka+kL=G)
+        a = kidx[i]
+        idx_occ_i, _, idx_vir_i = get_idx_metal(mo_occ[i])
+        idx_occ_a, idx_frac_a, idx_vir_a = get_idx_metal(mo_occ[a])
+
+        # merge index
+        idx_i = slice(idx_occ_i[0], idx_vir_i[0])
+        idx_a = slice(idx_occ_a[-1] + 1, idx_vir_a[-1] + 1)
+        nocc_i = len(idx_occ_i)
+        nfrac_a = len(idx_frac_a)
+
+        eia = mo_energy[i, idx_i, None] - mo_energy[a, None, idx_a]
+        fia = (mo_occ[i][idx_i, None] - mo_occ[a][None, idx_a]) / 2.0
+
+        # factor of 0.5 is for double counting
+        fia[nocc_i:, :nfrac_a] *= 0.5
+        # Response from both spin-up and spin-down density
+        rho_accum_inner(Pi, eia, omega, Lpq[i], alpha=4.0 / nkpts, fia=fia)
+
+    return Pi
+
+
+def rho_accum_inner(Pi, eia, omega, Lov, alpha=0.0, fia=None):
+    """Get contribution to response function from current occupied-virtual block.
+
+    Parameters
+    ----------
+    Pi : complex 2d array
+        density-density response function, will be overwritten
+    eia : double 2d array
+        occupied-virtual orbital energy difference
+    omega : double
+        real position of imaginary frequency
+    Lov : complex 3d array
+        occupied-virtual block of three-center density-fitting matrix in MO
+    alpha : float, optional
+        prefactor, by default 0.0
+    fia : double 2d array, optional
+        occupied-virtual occupation number difference, by default None
+    """
+    naux, nocc, nvir = Lov.shape
+
+    if fia is None:
+        eia = eia / (omega**2 + eia**2)
+    else:
+        eia = eia * fia / (omega**2 + eia**2)
+    Pia = (Lov * eia).reshape(naux, nocc * nvir)
+
+    # The following call to blas.zgemm may be replaced with
+    # Pi += alpha * np.einsum('Pia, Qia -> PQ', Pia, Lov.conj(), optimize=True)
+    # with a moderate performance hit.
+
+    # zgemm is complex matrix multiplication. A wrapper is included in SciPy.
+    # C <- alpha * op(A) @ op(B) + beta * C
+    blas.zgemm(
+        alpha=alpha,
+        a=Lov.reshape(naux, nocc * nvir).T,
+        b=Pia.T,
+        trans_a=2,  # take conjugate transpose of A (this gives Lov.conj())
+        trans_b=0,  # B is Pia.T
+        beta=1.0,
+        c=Pi.T,  # Pi.T += alpha * Lov.conj() @ Pia.T
+        overwrite_c=True,
+    )
+
+    return
+
+
+def rho_wing_accum_inner(Pi_P0, eia, omega, Lov, qov, alpha=0.0):
+    """Accumulate the finite-size-correction wing response for one OV slice.
+
+    Parameters
+    ----------
+    Pi_P0 : complex 1d array
+        finite-size correction to density-density response function, will be overwritten
+    eia : double 2d array
+        occupied-virtual orbital energy difference
+    omega : double
+        frequency
+    Lov : complex 3d array
+        occupied-virtual block of three-center density-fitting matrix in MO
+    qov : complex 2d array
+        virtual-occupied correction
+    alpha : float, optional
+        prefactor, by default 0.0
+    """
+    naux, nocc, nvir = Lov.shape
+    eia_q = eia * qov.conj() / (omega**2 + eia**2)
+    Pi_P0 += alpha * np.matmul(Lov.reshape(naux, nocc * nvir), eia_q.reshape(nocc * nvir))
+
+    return
+
+
+def get_rpa_ecorr(rpa, freqs, wts):
+    """Compute RPA correlation energy.
+
+    Parameters
+    ----------
+    rpa : KRPA
+        rpa object
+    freqs : double 1d array
+            frequency grid
+        wts : double 1d array
+            weight of grids
+
+    Returns
+    -------
+    e_corr : double
+        correlation energy
+    """
+    mo_coeff = np.array(_mo_frozen(rpa, rpa._scf.mo_coeff))
+    mo_energy = np.array(_mo_energy_frozen(rpa, rpa._scf.mo_energy))
+    mo_occ = np.array(_mo_occ_frozen(rpa, rpa._scf.mo_occ))
+
+    nocc = rpa.nocc
+    nmo = rpa.nmo
+    nvir = nmo - nocc
+    nao = rpa._scf.mo_coeff[0].shape[0]
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    nw = len(freqs)
+    mydf = rpa.with_df
+
+    # possible kpts shift center
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    is_metal = hasattr(rpa._scf, 'sigma')
+
+    if rpa.fc:
+        qij, q_abs, nq_pts = rpa.get_q_mesh(mo_energy, mo_coeff)
+
+    e_corr = 0j
+
+    # Precompute k-conservation table
+    # Given k-point indices (kL, i), kconserv_table[kshift,i] contains
+    # the index j that satisfies momentum conservation,
+    # (k(i) - k(j) - k(kL)) \dot a = 2n\pi
+    # i.e.
+    # - ki + kj + kL = G
+    kconserv_table = get_kconserv_ria_efficient(rpa.mol, kpts)
+    cderiarr = mydf.cderi_array()
+
+    for kL in range(nkpts):
+        # Lij: (ki, L, i, j) for looping every kL
+        if is_metal:
+            Lij = []
+        else:
+            Lij = None
+        # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+        # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+        # kidx = np.zeros((nkpts),dtype=np.int64)
+        # kidx_r = np.zeros((nkpts),dtype=np.int64)
+        for i, kpti in enumerate(kpts):
+            j = kconserv_table[kL, i]
+            kptj = kpts[j]
+            kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+            assert np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12  # kidx[i] = j
+            # kidx_r[j] = i
+            logger.debug(rpa, f'Read Lpq (kL: {kL+1} / {nkpts}, ki: {i}, kj: {j})')
+            # Read (L|pq) and ao2mo transform to (L|ij)
+            # support unequal naux on different k points
+            Lpq = cderiarr.load(kpti, kptj)
+            if Lpq.shape[-1] == (nao * (nao + 1)) // 2:
+                Lpq = lib.unpack_tril(Lpq).reshape(-1, nao**2)
+            else:
+                Lpq = Lpq.reshape(-1, nao**2)
+            Lpq = Lpq.astype(np.complex128)
+            tao = []
+            ao_loc = None
+            moij, ijslice = _conc_mos(mo_coeff[i], mo_coeff[j])[2:]
+
+            naux = Lpq.shape[0]
+            if not is_metal:
+                if Lij is None:
+                    Lij = np.zeros((nkpts, naux, nocc, nvir), dtype=np.complex128)
+                ijslice = (0, nocc, nmo + nocc, 2 * nmo)
+                r_e2(Lpq, moij, ijslice, tao, ao_loc, out=Lij[i])
+            else:
+                # Only (nocc+nfrac, nfrac+nvir) block of Lpq is needed
+                # This is consistent with the new get_rho_response_metal implementation
+                idx_occ_i, idx_frac_i, idx_vir_i = get_idx_metal(mo_occ[i])
+                idx_occ_j, idx_frac_j, idx_vir_j = get_idx_metal(mo_occ[j])
+
+                nocc_i = len(idx_occ_i)
+                nfrac_i = len(idx_frac_i)
+                nocc_j = len(idx_occ_j)
+                nfrac_j = len(idx_frac_j)
+                nvir_j = len(idx_vir_j)
+                ijslice = (0, nocc_i + nfrac_i, nmo + nocc_j, 2 * nmo)
+
+                Lij.append(r_e2(Lpq, moij, ijslice, tao, ao_loc).reshape(naux, nocc_i + nfrac_i, nfrac_j + nvir_j))
+
+        for w in range(nw):
+            if is_metal:
+                Pi = get_rho_response_metal(freqs[w], mo_energy, mo_occ, Lij, kconserv_table[kL])
+            else:
+                Pi = get_rho_response(freqs[w], mo_energy, Lij, kconserv_table[kL])
+            if kL == 0 and rpa.fc:
+                for iq in range(nq_pts):
+                    # head Pi_00
+                    Pi_00 = get_rho_response_head(freqs[w], mo_energy, qij[iq])
+                    Pi_00 = 4.0 * np.pi / np.linalg.norm(q_abs[iq]) ** 2 * Pi_00
+                    # wings Pi_P0
+                    Pi_P0 = get_rho_response_wing(freqs[w], mo_energy, Lij, qij[iq])
+                    Pi_P0 = np.sqrt(4.0 * np.pi) / np.linalg.norm(q_abs[iq]) * Pi_P0
+
+                    # assemble Pi
+                    Pi_fc = np.zeros((naux + 1, naux + 1), dtype=Pi.dtype)
+                    Pi_fc[0, 0] = Pi_00
+                    Pi_fc[0, 1:] = Pi_P0.conj()
+                    Pi_fc[1:, 0] = Pi_P0
+                    Pi_fc[1:, 1:] = Pi
+
+                    e_corr += get_rpa_ecorr_w(Pi_fc, wts[w])
+            else:
+                e_corr += get_rpa_ecorr_w(Pi, wts[w])
+
+    e_corr = e_corr.real
+    e_corr *= 1.0 / (2.0 * np.pi) / nkpts
+    return e_corr
+
+
+def get_rpa_ecorr_outcore(rpa, freqs, wts):
+    """Low-memory routine to compute RPA correlation energy.
+
+    Parameters
+    ----------
+    rpa : KRPA
+        rpa object
+    freqs : double 1d array
+        frequency grid
+    wts : double 1d array
+        weight of grids
+
+    Returns
+    -------
+    e_corr : double
+        correlation energy
+    """
+    mo_coeff = np.array(_mo_frozen(rpa, rpa._scf.mo_coeff))
+    mo_energy = np.array(_mo_energy_frozen(rpa, rpa._scf.mo_energy))
+
+    nocc = rpa.nocc
+    nmo = rpa.nmo
+    nao = rpa._scf.mo_coeff[0].shape[0]
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    nw = len(freqs)
+    mydf = rpa.with_df
+
+    # possible kpts shift center
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    if rpa.fc:
+        qij, q_abs, nq_pts = rpa.get_q_mesh(mo_energy, mo_coeff)
+
+    e_corr = 0j
+
+    # Precompute k-conservation table
+    # Given k-point indices (kL, i), kconserv_table[kshift,i] contains
+    # the index j that satisfies momentum conservation,
+    # (k(i) - k(j) - k(kL)) \dot a = 2n\pi
+    # i.e.
+    # - ki + kj + kL = G
+    kconserv_table = get_kconserv_ria_efficient(rpa.mol, kpts)
+    cderiarr = mydf.cderi_array()
+
+    for kL in range(nkpts):
+        Pi = None
+        Pi_P0 = None
+        nseg = nocc // rpa.segsize + 1
+        for iseg in range(nseg):
+            orb_start = iseg * rpa.segsize
+            orb_end = min((iseg + 1) * rpa.segsize, nocc)
+            if orb_end == orb_start:
+                continue
+            norb_this_iter = orb_end - orb_start
+
+            # Lij: (ki, L, i, j) for looping every kL
+            # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+            # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+            # kidx = np.zeros((nkpts),dtype=np.int64)
+            # kidx_r = np.zeros((nkpts),dtype=np.int64)
+            for i, kpti in enumerate(kpts):
+                j = kconserv_table[kL, i]
+                kptj = kpts[j]
+                # Find (ki,kj) that satisfies momentum conservation with kL
+                kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+                assert np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12
+                logger.debug(rpa, f'Read Lpq (kL: {kL+1} / {nkpts}, ki: {i}, kj: {j})')
+                # Read (L|pq) and ao2mo transform to (L|ij)
+                # support uneqaul naux on different k points
+                Lpq = cderiarr.load(kpti, kptj)
+                if Lpq.shape[-1] == (nao * (nao + 1)) // 2:
+                    Lpq = lib.unpack_tril(Lpq).reshape(-1, nao**2)
+                else:
+                    Lpq = Lpq.reshape(-1, nao**2)
+                Lpq = Lpq.astype(np.complex128)
+                naux = Lpq.shape[0]
+
+                tao = []
+                ao_loc = None
+                moij, ijslice = _conc_mos(mo_coeff[i], mo_coeff[j])[2:]
+
+                ijslice = (orb_start, orb_end, nmo + nocc, 2 * nmo)
+                Lij_slice = r_e2(Lpq, moij, ijslice, tao, ao_loc)
+                Lij_slice = Lij_slice.reshape(naux, norb_this_iter, nmo - nocc)
+                if Pi is None:
+                    Pi = np.zeros((nw, naux, naux), dtype=np.complex128)
+                    if kL == 0 and rpa.fc:
+                        Pi_P0 = np.zeros((nq_pts, nw, naux), dtype=np.complex128)
+
+                # Find ka that conserves with ki and kL (-ki+ka+kL=G)
+                a_inner = kconserv_table[kL, i]
+                eia = mo_energy[i][orb_start:orb_end, None] - mo_energy[a_inner][None, nocc:]
+                for w in range(nw):
+                    rho_accum_inner(Pi[w], eia, freqs[w], Lij_slice, alpha=4.0 / nkpts)
+                    if kL == 0 and rpa.fc:
+                        for iq in range(nq_pts):
+                            rho_wing_accum_inner(
+                                Pi_P0[iq, w],
+                                eia,
+                                freqs[w],
+                                Lij_slice,
+                                qij[iq, i, orb_start:orb_end],
+                                alpha=4.0 / nkpts,
+                            )
+
+        for w in range(nw):
+            if kL == 0 and rpa.fc:
+                for iq in range(nq_pts):
+                    Pi_00 = get_rho_response_head(freqs[w], mo_energy, qij[iq])
+                    Pi_00 = 4.0 * np.pi / np.linalg.norm(q_abs[iq]) ** 2 * Pi_00
+                    Pi_P0_iq = np.sqrt(4.0 * np.pi) / np.linalg.norm(q_abs[iq]) * Pi_P0[iq, w]
+
+                    Pi_fc = np.zeros((naux + 1, naux + 1), dtype=Pi.dtype)
+                    Pi_fc[0, 0] = Pi_00
+                    Pi_fc[0, 1:] = Pi_P0_iq.conj()
+                    Pi_fc[1:, 0] = Pi_P0_iq
+                    Pi_fc[1:, 1:] = Pi[w]
+
+                    e_corr += get_rpa_ecorr_w(Pi_fc, wts[w])
+            else:
+                e_corr += get_rpa_ecorr_w(Pi[w], wts[w])
+
+    e_corr = e_corr.real
+    e_corr *= 1.0 / (2.0 * np.pi) / nkpts
+    return e_corr
+
+
+def get_rpa_ecorr_outcore_metal(rpa, freqs, wts):
+    """Low-memory routine to compute RPA correlation energy for metals.
+
+    Parameters
+    ----------
+    rpa : KRPA
+        rpa object
+    freqs : double 1d array
+        frequency grid
+    wts : double 1d array
+        weight of grids
+
+    Returns
+    -------
+    e_corr : double
+        correlation energy
+    """
+    mo_coeff = np.array(_mo_frozen(rpa, rpa._scf.mo_coeff))
+    mo_energy = np.array(_mo_energy_frozen(rpa, rpa._scf.mo_energy))
+    mo_occ = np.array(_mo_occ_frozen(rpa, rpa._scf.mo_occ))
+
+    nmo = rpa.nmo
+    nao = rpa._scf.mo_coeff[0].shape[0]
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    nw = len(freqs)
+    mydf = rpa.with_df
+
+    # possible kpts shift center
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    e_corr = 0j
+
+    # Precompute k-conservation table
+    # Given k-point indices (kL, i), kconserv_table[kshift,i] contains
+    # the index j that satisfies momentum conservation,
+    # (k(i) - k(j) - k(kL)) \dot a = 2n\pi
+    # i.e.
+    # - ki + kj + kL = G
+    kconserv_table = get_kconserv_ria_efficient(rpa.mol, kpts)
+    cderiarr = mydf.cderi_array()
+
+    for kL in range(nkpts):
+        Pi = None
+        # Lij: (ki, L, i, j) for looping every kL
+        # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+        # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+        # kidx = np.zeros((nkpts),dtype=np.int64)
+        # kidx_r = np.zeros((nkpts),dtype=np.int64)
+        for i, kpti in enumerate(kpts):
+            j = kconserv_table[kL, i]
+            kptj = kpts[j]
+            # Find (ki,kj) that satisfies momentum conservation with kL
+            kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+            assert np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12
+            logger.debug(rpa, f'Read Lpq (kL: {kL+1} / {nkpts}, ki: {i}, kj: {j})')
+            # Read (L|pq) and ao2mo transform to (L|ij)
+            # support uneqaul naux on different k points
+            Lpq = cderiarr.load(kpti, kptj)
+            if Lpq.shape[-1] == (nao * (nao + 1)) // 2:
+                Lpq = lib.unpack_tril(Lpq).reshape(-1, nao**2)
+            else:
+                Lpq = Lpq.reshape(-1, nao**2)
+            Lpq = Lpq.astype(np.complex128)
+            naux = Lpq.shape[0]
+
+            idx_occ_i, idx_frac_i, idx_vir_i = get_idx_metal(mo_occ[i])
+            idx_occ_j, idx_frac_j, idx_vir_j = get_idx_metal(mo_occ[j])
+
+            nocc_i = len(idx_occ_i)
+            nfrac_i = len(idx_frac_i)
+            nocc_j = len(idx_occ_j)
+            nfrac_j = len(idx_frac_j)
+            nseg = (nocc_i + nfrac_i) // rpa.segsize + 1
+            for iseg in range(nseg):
+                orb_start = iseg * rpa.segsize
+                orb_end = min((iseg + 1) * rpa.segsize, nocc_i + nfrac_i)
+                if orb_end == orb_start:
+                    break
+                norb_this_iter = orb_end - orb_start
+
+                tao = []
+                ao_loc = None
+                moij, ijslice = _conc_mos(mo_coeff[i], mo_coeff[j])[2:]
+
+                ijslice = (orb_start, orb_end, nmo + nocc_j, 2 * nmo)
+                Lij_slice = r_e2(Lpq, moij, ijslice, tao, ao_loc)
+                Lij_slice = Lij_slice.reshape(naux, norb_this_iter, nmo - nocc_j)
+                if Pi is None:
+                    Pi = np.zeros((nw, naux, naux), dtype=np.complex128)
+
+                # Find ka that conserves with ki and kL (-ki+ka+kL=G)
+                eia = mo_energy[i][orb_start:orb_end, None] - mo_energy[j][None, nocc_j:]
+                fia = (mo_occ[i][orb_start:orb_end, None] - mo_occ[j][None, nocc_j:]) / 2.0
+                # The overall fia[nocc_i:, :nfrac_j] *= 0.5 for double counting
+                if orb_start >= nocc_i:
+                    fia[:, :nfrac_j] *= 0.5
+                elif orb_end > nocc_i:
+                    offset = nocc_i - orb_start
+                    fia[offset:, :nfrac_j] *= 0.5
+                for w in range(nw):
+                    rho_accum_inner(Pi[w], eia, freqs[w], Lij_slice, alpha=4.0 / nkpts, fia=fia)
+
+        for w in range(nw):
+            e_corr += get_rpa_ecorr_w(Pi[w], wts[w])
+
+    e_corr = e_corr.real
+    e_corr *= 1.0 / (2.0 * np.pi) / nkpts
+    return e_corr
+
+
+def get_rpa_ecorr_w(Pi_w, wts_w):
+    """Get contribution to RPA correlation energy from a single frequency.
+
+    Parameters
+    ----------
+    Pi_w : complex 2d array
+        density-density response function at a single frequency
+    wts_w : double
+        weights of the frequency
+
+    Returns
+    -------
+    e_corr : double
+        correlation energy
+    """
+    # First, compute ec_w = Tr(Pi_w) + |log(det(I-Pi_w))|
+    ec_w = np.trace(Pi_w)
+    # The following two lines are equivalent to
+    # Pi_w = np.eye(naux) - Pi_w
+    blas.zdscal(-1.0, Pi_w.ravel(), overwrite_x=1)
+    np.fill_diagonal(Pi_w, np.diagonal(Pi_w) + 1.0)
+
+    ec_w += np.linalg.slogdet(Pi_w)[1]
+    #e_corr = 1.0 / (2.0 * np.pi) / nkpts * ec_w * wts_w
+    e_corr = ec_w * wts_w
+
+    return e_corr
+
+
+def get_rpa_exx(rpa, acfd=False, correction_only=False):
+    """Calculate RPA exchange energy.
+    For gapped systems, Hartree-Fock and adiabatic connection fluctuation dissipation exchange energies are the same.
+    For metallic systems, they are different.
+    The ACFD exchange energy is given by equation 12 in doi.org/10.1103/PhysRevB.81.115126
+
+    Parameters
+    ----------
+    rpa : KRPA
+        rpa object
+    acfd : bool, optional
+        calculate ACFD exchange energy, by default False
+    correction_only : bool, optional
+        only calculate the correction term, by default False
+
+    Returns
+    -------
+    ex : double
+        exchange energy
+    """
+    mo_energy = np.array(_mo_energy_frozen(rpa, rpa._scf.mo_energy))
+    mo_coeff = np.array(_mo_frozen(rpa, rpa._scf.mo_coeff))
+    mo_occ = np.array(_mo_occ_frozen(rpa, rpa._scf.mo_occ))
+
+    nocc = rpa.nocc
+    nao = rpa._scf.mo_coeff[0].shape[0]
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    mydf = rpa.with_df
+
+    # possible kpts shift center
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    ex = 0j
+    cderiarr = mydf.cderi_array()
+    for kL in range(nkpts):
+        # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+        # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+        kidx = np.zeros(shape=[nkpts], dtype=np.int64)
+        kidx_r = np.zeros(shape=[nkpts], dtype=np.int64)
+        for i in range(nkpts):
+            for j in range(nkpts):
+                # Find (ki,kj) that satisfies momentum conservation with kL
+                kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+                is_kconserv = np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12
+                if is_kconserv:
+                    kidx[i] = j
+                    kidx_r[j] = i
+
+        for kn in range(nkpts):
+            # Find km that conserves with kn and kL (-km+kn+kL=G)
+            km = kidx_r[kn]
+
+            # logger.debug(gw, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s @ Rank %d)' % (kL + 1, nkpts, i, j, rank))
+            # Read (L|pq) and ao2mo transform to (L|ij)
+            # support unequal naux on different k points
+            Lpq_ao = cderiarr.load(kpts[km], kpts[kn])
+            if Lpq_ao.shape[-1] == (nao * (nao + 1)) // 2:
+                Lpq_ao = lib.unpack_tril(Lpq_ao).reshape(-1, nao**2)
+            else:
+                Lpq_ao = Lpq_ao.reshape(-1, nao**2)
+            Lpq_ao = Lpq_ao.astype(np.complex128)
+
+            Lij = None
+            if hasattr(rpa._scf, 'sigma'):
+                idx_occ_i, idx_frac_i, _ = get_idx_metal(mo_occ[km])
+                idx_occ_j, idx_frac_j, _ = get_idx_metal(mo_occ[kn])
+                nocc_i = len(idx_occ_i) + len(idx_frac_i)
+                nocc_j = len(idx_occ_j) + len(idx_frac_j)
+                moij, ijslice = _conc_mos(mo_coeff[km][:, :nocc_i], mo_coeff[kn][:, :nocc_j])[2:]
+                Lij = r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lij)
+                Lij = Lij.reshape(-1, nocc_i, nocc_j)
+
+                if acfd is True:
+                    if correction_only is True:
+                        mo_occ_ij = np.minimum(mo_occ[km][:nocc_i, None], mo_occ[kn][None, :nocc_j]) / 2.0
+                        mo_occ_ij -= mo_occ[km][:nocc_i, None] * mo_occ[kn][None, :nocc_j] / 4.0
+                    else:
+                        # numerical integration for equation 12 in doi.org/10.1103/PhysRevB.81.115126
+                        # NOTE: this integration is not stable!!!
+                        # w, wts = _get_scaled_legendre_roots(200)
+                        #eij = mo_energy[km][:nocc_i, None] - mo_energy[kn][None, :nocc_j]
+                        ##integrad = eij[:, :, None] / lib.direct_sum("ij+w->ijw", eij**2, w**2) * wts[None, None]
+                        #integrand = eij[:, :, None] / (eij[:, :, None]**2 + w**2) * wts[None, None]
+                        #integrand = np.sum(integrand, axis=2) * 2.0 / np.pi
+
+                        # The following line is equivalent to the frequency integration in equation 12 in
+                        # doi.org/10.1103/PhysRevB.81.115126
+                        # TODO: add a detailed note
+                        eij = mo_energy[km][:nocc_i, None] - mo_energy[kn][None, :nocc_j]
+                        integrand = np.zeros((nocc_i, nocc_j), dtype=np.complex128)
+                        integrand[eij > 1e-6] = 1
+                        integrand[eij < -1e-6] = -1
+                        mo_occ_ij = 1.0 - integrand
+                        # spin-restricted mo_occ should be divided by 2
+                        mo_occ_ij = mo_occ_ij * mo_occ[km][:nocc_i, None] / 2.0
+                else:
+                    mo_occ_ij = mo_occ[km][:nocc_i, None] * mo_occ[kn][None, :nocc_j] / 4.0
+                Lij_occ = Lij * mo_occ_ij[None]
+                # ex -= np.einsum('Lij,Lij->', Lij_occ.reshape(-1, nocc, nocc), Lij.reshape(-1, nocc, nocc).conj())
+                ex -= blas.zdotc(Lij_occ.ravel(), Lij.ravel())
+            else:
+                moij, ijslice = _conc_mos(mo_coeff[km][:, :nocc], mo_coeff[kn][:, :nocc])[2:]
+                Lij = r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lij)
+                # ex -= np.einsum('Lij,Lij->', Lij.reshape(-1, nocc, nocc), Lij.reshape(-1, nocc, nocc).conj())
+                ex -= blas.zdotc(Lij.ravel(), Lij.ravel())
+
+    ex = ex.real
+    ex /= nkpts**2
+
+    if rpa._scf.exxdiv == 'ewald' and rpa._scf.cell.dimension != 0:
+        madelung = tools.pbc.madelung(rpa._scf.cell, kpts)
+        exxdiv_shift = madelung * np.sum(mo_occ**2) / (4.0 * nkpts)
+        ex -= exxdiv_shift
+        if acfd is True:
+            for k in range(nkpts):
+                idx_occ, idx_frac, _ = get_idx_metal(mo_occ[k])
+                f_i = mo_occ[k][:(len(idx_occ) + len(idx_frac))] / 2.0
+                ex -= madelung * np.sum(f_i - f_i * f_i) / nkpts
+
+    return ex
+
+
+def get_kconserv_ria_efficient(cell, kpts, tol=1e-12):
+    r"""Get the momentum conservation array for single excitation amplitudes
+    for a set of k-points with appropriate k-shift.
+
+
+    Given k-point indices (kshift, m) the array kconserv[kshift,m] returns
+    the index n that satisfies momentum conservation,
+
+        (k(m) - k(n) - k(kshift)) \dot a = 2n\pi
+    """
+    nkpts = kpts.shape[0]
+    a = cell.lattice_vectors() / (2 * np.pi)
+
+    kconserv = np.zeros((nkpts, nkpts), dtype=int)
+    kvKM = -kpts[:, None, :] + kpts[:, :]
+    for N, kvN in enumerate(kpts):
+        kvKMN = np.einsum('wx,kmx->wkm', a, kvKM - kvN, optimize=True)
+        # check whether (1/(2pi) k_{KLN} dot a) is an integer
+        kvKMN_int = np.rint(kvKMN)
+        mask = np.einsum('wkm->km', abs(kvKMN - kvKMN_int), optimize=True) < tol
+        kconserv[mask] = N
+    return kconserv
+
+
+class KRPA(lib.StreamObject):
+    def __init__(self, mf, frozen=None):
+        self.mol = mf.mol  # mol object
+        self._scf = mf  # mean-field object
+        self.verbose = self.mol.verbose  # verbose level
+        self.stdout = self.mol.stdout  # standard output
+        self.max_memory = mf.max_memory  # max memory in MB
+
+        # options
+        self.frozen = frozen  # frozen orbital options
+        self.grids_alg = 'legendre'  # algorithm to generate grids
+        self.outcore = False  # low-memory routine
+        self.segsize = 50  # number of orbitals in one segment for outcore
+        self.fc = False  # finite-size correction
+        self.fc_grid = False  # grids for finite-size correction
+        self.acfd_exx = False  # calculate ACFD exchange energy
+
+        # don't modify the following attributes, they are not input options
+        self._nocc = None  # number of occupied orbitals
+        self._nmo = None  # number of orbitals (exclude frozen orbitals)
+        self.kpts = mf.kpts  # k-points
+        self.nkpts = len(self.kpts)  # number of k-points
+        self.mo_energy = np.array(mf.mo_energy, copy=True)  # orbital energy
+        self.mo_coeff = np.array(mf.mo_coeff, copy=True)  # orbital coefficient
+        self.mo_occ = np.array(mf.mo_occ, copy=True)  # occupation number
+        self.e_corr = None  # correlation energy
+        self.e_hf = None  # Hartree-Fock energy
+        self.e_tot = None  # total energy
+
+        # KRPA must use GDF integrals
+        if getattr(mf, 'with_df', None):
+            self.with_df = mf.with_df
+        else:
+            raise NotImplementedError
+        self._keys.update(['with_df'])
+
+        return
+
+    def dump_flags(self, verbose=None):
+        log = logger.Logger(self.stdout, self.verbose)
+        log.info('')
+        log.info('******** %s ********', self.__class__)
+        log.info('method = %s', self.__class__.__name__)
+        nocc = self.nocc
+        nvir = self.nmo - nocc
+        nkpts = self.nkpts
+        log.info(f'RPA nocc = {nocc}, nvir = {nvir}, nkpts = {nkpts}')
+        if self.frozen is not None:
+            log.info(f'frozen orbitals = {str(self.frozen)}')
+        log.info('grid type = %s', self.grids_alg)
+        log.info('outcore mode = %s', self.outcore)
+        if self.outcore is True:
+            log.info('outcore segment size = %d', self.segsize)
+        log.info('RPA finite size corrections = %s', self.fc)
+        log.info('ACFD exchange energy = %s', self.acfd_exx)
+        log.info('')
+        return
+
+    @property
+    def nocc(self):
+        frozen_mask = get_frozen_mask(self)
+        nkpts = len(self._scf.mo_energy)
+        nelec = 0.0
+        for k in range(nkpts):
+            nelec += np.sum(self._scf.mo_occ[k][frozen_mask[k]])
+        nelec = int(nelec / nkpts)
+        return nelec // 2
+
+    @nocc.setter
+    def nocc(self, n):
+        self._nocc = n
+
+    @property
+    def nmo(self):
+        frozen_mask = get_frozen_mask(self)
+        return len(self._scf.mo_energy[0][frozen_mask[0]])
+
+    @nmo.setter
+    def nmo(self, n):
+        self._nmo = n
+
+    get_nocc = get_nocc
+    get_nmo = get_nmo
+    get_frozen_mask = get_frozen_mask
+
+    def kernel(self, mo_energy=None, mo_coeff=None, nw=None, with_e_hf=None):
+        """RPA correlation and total energy
+
+        Calculated total energy, HF energy and RPA correlation energy
+        are stored in self.e_tot, self.e_hf, self.e_corr
+
+        Parameters
+        ----------
+        mo_energy : double array
+            molecular orbital energies
+        mo_coeff : double ndarray
+            molecular orbital coefficients
+        nw : int, optional
+            number of frequency point on imaginary axis, by default None
+        with_e_hf : float, optional
+            If given, overrides the HF energy computation.
+
+        Returns
+        -------
+        e_tot : float
+            RPA total energy
+        e_hf : float
+            HF energy (exact exchange for given mo_coeff)
+        e_corr : float
+            RPA correlation energy
+        """
+        if mo_coeff is None:
+            mo_coeff = _mo_frozen(self, self._scf.mo_coeff)
+        if mo_energy is None:
+            mo_energy = _mo_energy_frozen(self, self._scf.mo_energy)
+
+        cput0 = (time.process_time(), time.perf_counter())
+        self.dump_flags()
+        self.e_tot, self.e_hf, self.e_corr = kernel(self, mo_energy, mo_coeff, nw=nw, with_e_hf=with_e_hf)
+        logger.timer(self, 'RPA', *cput0)
+        return self.e_tot, self.e_hf, self.e_corr
+
+    def get_grids(self, alg=None, nw=None, mo_energy=None):
+        """Generate grids for integration.
+
+        Parameters
+        ----------
+        alg : str, optional
+            algorithm for generating grids, by default None
+        nw : int, optional
+            number of grids, by default None
+        mo_energy : double 2d array, optional
+            orbital energy, used for minimax grids, by default None
+
+        Returns
+        -------
+        freqs : double 1d array
+            frequency grid
+        wts : double 1d array
+            weight of grids
+        """
+        if alg is None:
+            alg = self.grids_alg
+        if mo_energy is None:
+            mo_energy = _mo_energy_frozen(self, self._scf.mo_energy)
+        if alg == 'legendre':
+            nw = 40 if nw is None else nw
+            freqs, wts = _get_scaled_legendre_roots(nw)
+        else:
+            raise NotImplementedError('Grids algorithm not implemented!')
+
+        return freqs, wts
+
+    def get_q_mesh(self, mo_energy, mo_coeff):
+        """Get q-mesh for finite size correction.
+        Equation 39-42 in doi.org/10.1021/acs.jctc.0c00704
+
+        Parameters
+        ----------
+        mo_energy : double 2d array
+            orbital energy
+        mo_coeff : double 3d array
+            coefficient from AO to MO
+
+        Returns
+        -------
+        qij : double 1d array
+            q-mesh grids
+        q_abs : double 1d array
+            absolute positions of q-mesh grids
+        nq_pts : init
+            number of q-mesh grids
+        """
+        nocc = self.nocc
+        nmo = self.nmo
+        nkpts = self.nkpts
+        # Set up q mesh for q->0 finite size correction
+        if not self.fc_grid:
+            q_pts = np.array([1e-3, 0, 0], dtype=np.double).reshape(1, 3)
+        else:
+            Nq = 3
+            q_pts = np.zeros(shape=[Nq**3 - 1, 3], dtype=np.double)
+            for i in range(Nq):
+                for j in range(Nq):
+                    for k in range(Nq):
+                        if i == 0 and j == 0 and k == 0:
+                            continue
+                        else:
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 0] = k * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 1] = j * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 2] = i * 5e-4
+        nq_pts = len(q_pts)
+        q_abs = self.mol.get_abs_kpts(q_pts)
+
+        # Get qij = 1/sqrt(Omega) * < psi_{ik} | e^{iqr} | psi_{ak-q} > at q: (nkpts, nocc, nvir)
+        qij = np.zeros(shape=[nq_pts, nkpts, nocc, nmo - nocc], dtype=np.complex128)
+
+        if not self.fc_grid:
+            for k in range(nq_pts):
+                qij[k] = get_qij(self, q_abs[k], mo_energy, mo_coeff)
+        else:
+            for k in range(nq_pts):
+                qij[k] = get_qij(self, q_abs[k], mo_energy, mo_coeff)
+
+        return qij, q_abs, nq_pts
+
+    def get_acfd_exx(self, correction_only=False):
+        """Calculate ACFD exchange energy.
+
+        Parameters
+        ----------
+        correction_only : bool
+            only return the correction term
+
+        Returns
+        -------
+        ex_acfd : double
+            ACFD exchange energy
+        """
+        ex_acfd = get_rpa_exx(self, acfd=True, correction_only=correction_only)
+        return ex_acfd
diff --git a/pyscf/pbc/gw/kugw_ac.py b/pyscf/pbc/gw/kugw_ac.py
index f26d5204b6..c63e246b62 100644
--- a/pyscf/pbc/gw/kugw_ac.py
+++ b/pyscf/pbc/gw/kugw_ac.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2021 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,275 +14,540 @@
 # limitations under the License.
 #
 # Author: Tianyu Zhu <zhutianyu1991@gmail.com>
+# Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
+# Author: Jiachen Li <lijiachen.duke@gmail.com>
 #
 
 '''
-PBC spin-unrestricted G0W0-AC QP eigenvalues with k-point sampling
+Periodic spin-unrestricted G0W0 method based on the analytic continuation scheme.
+This implementation has N^4 scaling,
+and is faster than GW-CD (N^4~N^5) and fully analytic GW (N^6) methods.
 GW-AC is recommended for valence states only, and is inaccurate for core states.
 
-Method:
-    See T. Zhu and G.K.-L. Chan, arxiv:2007.03148 (2020) for details
-    Compute Sigma on imaginary frequency with density fitting,
-    then analytically continued to real frequency.
-    Gaussian density fitting must be used (FFTDF and MDF are not supported).
+References:
+    T. Zhu and G.K.-L. Chan, J. Chem. Theory. Comput. 17, 727-741 (2021)
+    New J. Phys. 14 053020 (2012)
 '''
 
 from functools import reduce
-import numpy
-import numpy as np
 import h5py
-from scipy.optimize import newton, least_squares
+import time
+import numpy as np
+import scipy
 
 from pyscf import lib
-from pyscf.lib import logger
 from pyscf.ao2mo import _ao2mo
 from pyscf.ao2mo.incore import _conc_mos
+from pyscf.lib import einsum, logger, temporary_env
 from pyscf.pbc import df, dft, scf
-from pyscf.pbc.cc.kccsd_uhf import get_nocc, get_nmo, get_frozen_mask
-from pyscf import __config__
-
-einsum = lib.einsum
-
-def kernel(gw, mo_energy, mo_coeff, orbs=None,
-           kptlist=None, nw=None, verbose=logger.NOTE):
-    '''
-    GW-corrected quasiparticle orbital energies
-    Returns:
-        A list :  converged, mo_energy, mo_coeff
-    '''
-    mf = gw._scf
-    assert gw.frozen is None
+from pyscf.pbc.mp.kump2 import get_frozen_mask
 
-    nmoa, nmob = gw.nmo
-    nocca, noccb = gw.nocc
+from pyscf.pbc.gw.krgw_ac import KRGWAC
+from pyscf.gw.utils.ac_grid import _get_scaled_legendre_roots, PadeAC, TwoPoleAC
+from pyscf.gw.utils.gw_np_helper import mkslice, array_scale
 
-    if orbs is None:
-        orbs = range(nmoa)
-    if kptlist is None:
-        kptlist = range(gw.nkpts)
 
+def kernel(gw):
+    mf = gw._scf
+    nmo = gw.nmo[0]
     nkpts = gw.nkpts
-    nklist = len(kptlist)
+
+    # set frozen orbitals
+    gw.set_frozen_orbs()
+    orbs = gw.orbs
+    orbs_frz = gw.orbs_frz
+    kptlist = gw.kptlist
+    if kptlist is None:
+        gw.kptlist = kptlist = range(gw.nkpts)
+    mo_energy_frz = _mo_energy_frozen(gw, gw.mo_energy)
+    mo_coeff_frz = _mo_frozen(gw, gw.mo_coeff)
 
     # v_xc
-    dm = np.array(mf.make_rdm1())
-    v_mf = np.array(mf.get_veff())
-    vj = np.array(mf.get_j(dm_kpts=dm))
-    v_mf[0] = v_mf[0] - (vj[0] + vj[1])
-    v_mf[1] = v_mf[1] - (vj[0] + vj[1])
+    with temporary_env(mf, verbose=0), temporary_env(mf.mol, verbose=0), temporary_env(mf.with_df, verbose=0):
+        dm = mf.make_rdm1()
+        v_mf_ao = mf.get_veff()
+        vj_ao = mf.get_j(dm_kpts=dm)
+    v_mf_ao[0] = v_mf_ao[0] - (vj_ao[0] + vj_ao[1])
+    v_mf_ao[1] = v_mf_ao[1] - (vj_ao[0] + vj_ao[1])
+    v_mf = np.zeros(shape=[2, nkpts, nmo, nmo], dtype=np.complex128)
     for s in range(2):
         for k in range(nkpts):
-            v_mf[s,k] = reduce(numpy.dot, (mo_coeff[s,k].T.conj(), v_mf[s,k], mo_coeff[s,k]))
+            v_mf[s, k] = reduce(np.matmul, (mo_coeff_frz[s, k].T.conj(), v_mf_ao[s, k], mo_coeff_frz[s, k]))
+    gw.vxc = v_mf
 
     # v_hf from DFT/HF density
-    if gw.fc:
-        exxdiv = 'ewald'
-    else:
-        exxdiv = None
-    uhf = scf.KUHF(gw.mol, gw.kpts, exxdiv=exxdiv)
+    if isinstance(mf.with_df, df.GDF):
+        uhf = scf.KUHF(gw.mol.copy(deep=True), gw.kpts, exxdiv=None).density_fit()
+    elif isinstance(mf.with_df, df.RSDF):
+        uhf = scf.KUHF(gw.mol.copy(deep=True), gw.kpts, exxdiv=None).rs_density_fit()
+    if hasattr(mf, 'sigma'):
+        uhf = scf.addons.smearing_(uhf, sigma=mf.sigma, method=mf.smearing_method)
     uhf.with_df = gw.with_df
-    uhf.with_df._cderi = gw.with_df._cderi
-    if uhf.with_df._j_only:
-        logger.debug(gw, 'Rebuild CDERI for exchange integrals')
-        uhf.with_df.build(j_only=False)
-    vk = uhf.get_veff(gw.mol,dm_kpts=dm)
-    vj = uhf.get_j(gw.mol,dm_kpts=dm)
-    vk[0] = vk[0] - (vj[0] + vj[1])
-    vk[1] = vk[1] - (vj[0] + vj[1])
+    uhf.verbose = uhf.mol.verbose = 0
+    with temporary_env(uhf, verbose=0), temporary_env(uhf.with_df, verbose=0):
+        vk_ao = uhf.get_veff(dm_kpts=dm)
+        vj_ao = uhf.get_j(dm_kpts=dm)
+    vk_ao[0] = vk_ao[0] - (vj_ao[0] + vj_ao[1])
+    vk_ao[1] = vk_ao[1] - (vj_ao[0] + vj_ao[1])
+    vk = np.zeros(shape=[2, nkpts, nmo, nmo], dtype=np.complex128)
     for s in range(2):
         for k in range(nkpts):
-            vk[s,k] = reduce(numpy.dot, (mo_coeff[s,k].T.conj(), vk[s,k], mo_coeff[s,k]))
+            vk[s, k] = reduce(np.matmul, (mo_coeff_frz[s, k].T.conj(), vk_ao[s, k], mo_coeff_frz[s, k]))
 
-    # Grids for integration on imaginary axis
-    freqs,wts = _get_scaled_legendre_roots(nw)
-
-    # Compute self-energy on imaginary axis i*[0,iw_cutoff]
-    sigmaI, omega = get_sigma_diag(gw, orbs, kptlist, freqs, wts, iw_cutoff=5.)
-
-    # Analytic continuation
-    coeff_a = []
-    coeff_b = []
+    # finite size correction for exchange self-energy
+    if gw.fc:
+        vk_corr = -2.0 / np.pi * (6.0 * np.pi**2 / gw.mol.vol / nkpts) ** (1.0 / 3.0)
+        for s in range(2):
+            for k in range(nkpts):
+                # NOTE: here was a bug in commits before 2024/12
+                for i in range(gw.nocc[s]):
+                    vk[s][k][i, i] = vk[s][k][i, i] + vk_corr
+    gw.vk = vk
+
+    # set up Fermi level
+    ef = gw.ef = get_ef(kmf=mf, mo_energy=mf.mo_energy)
+
+    # grids for integration on imaginary axis
+    gw.freqs, gw.wts = freqs, wts = _get_scaled_legendre_roots(gw.nw)
+
+    # calculate self-energy on imaginary axis
+    sigmaI, omega = get_sigma(
+        gw, freqs, wts, ef=ef, mo_energy=mo_energy_frz, orbs=orbs_frz, kptlist=kptlist, iw_cutoff=gw.ac_iw_cutoff,
+        fullsigma=gw.fullsigma)
+
+    # analytic continuation
     if gw.ac == 'twopole':
-        for k in range(nklist):
-            coeff_a.append(AC_twopole_diag(sigmaI[0,k], omega[0], orbs, nocca))
-            coeff_b.append(AC_twopole_diag(sigmaI[1,k], omega[1], orbs, noccb))
+        acobj = TwoPoleAC(list(range(nmo)), gw.nocc)
     elif gw.ac == 'pade':
-        for k in range(nklist):
-            coeff_a_tmp, omega_fit_a = AC_pade_thiele_diag(sigmaI[0,k], omega[0])
-            coeff_b_tmp, omega_fit_b = AC_pade_thiele_diag(sigmaI[1,k], omega[1])
-            coeff_a.append(coeff_a_tmp)
-            coeff_b.append(coeff_b_tmp)
-        omega_fit = np.asarray((omega_fit_a, omega_fit_b))
-    coeff = np.asarray((coeff_a, coeff_b))
-
-    conv = True
-    # This code does not support metals
-    homo = -99.
-    lumo = 99.
-    mo_energy = np.asarray(mf.mo_energy)
-    for k in range(nkpts):
-        if homo < max(mo_energy[0,k][nocca-1],mo_energy[1,k][noccb-1]):
-            homo = max(mo_energy[0,k][nocca-1],mo_energy[1,k][noccb-1])
-        if lumo > min(mo_energy[0,k][nocca],mo_energy[1,k][noccb]):
-            lumo = min(mo_energy[0,k][nocca],mo_energy[1,k][noccb])
-    ef = (homo+lumo)/2.
+        acobj = PadeAC(npts=gw.ac_pade_npts, step_ratio=gw.ac_pade_step_ratio)
+    else:
+        raise ValueError('Unknown GW-AC type %s' % (str(gw.ac)))
+
+    acobj.ac_fit(sigmaI, omega, axis=-1)
 
-    mo_energy = np.zeros_like(np.array(mf.mo_energy))
+    if gw.fullsigma:
+        diag_acobj = acobj.diagonal(axis1=2, axis2=3)
+    else:
+        diag_acobj = acobj
+
+    mo_energy = np.zeros_like(mf.mo_energy)
     for s in range(2):
-        for k in range(nklist):
-            kn = kptlist[k]
-            for p in orbs:
-                if gw.linearized:
+        for ik, k in enumerate(kptlist):
+            for ip, p in enumerate(orbs_frz):
+                if gw.qpe_linearized:
                     # linearized G0W0
                     de = 1e-6
-                    ep = mf.mo_energy[s][kn][p]
-                    #TODO: analytic sigma derivative
-                    if gw.ac == 'twopole':
-                        sigmaR = two_pole(ep-ef, coeff[s,k,:,p-orbs[0]]).real
-                        dsigma = two_pole(ep-ef+de, coeff[s,k,:,p-orbs[0]]).real - sigmaR.real
-                    elif gw.ac == 'pade':
-                        sigmaR = pade_thiele(ep-ef, omega_fit[s,p-orbs[0]], coeff[s,k,:,p-orbs[0]]).real
-                        dsigma = pade_thiele(ep-ef+de, omega_fit[s,p-orbs[0]],
-                                             coeff[s,k,:,p-orbs[0]]).real - sigmaR.real
-                    zn = 1.0/(1.0-dsigma/de)
-                    e = ep + zn*(sigmaR.real + vk[s,kn,p,p].real - v_mf[s,kn,p,p].real)
-                    mo_energy[s,kn,p] = e
+                    ep = mf.mo_energy[s][k][orbs[ip]]
+                    sigmaR = diag_acobj[s, ik, ip].ac_eval(ep).real
+                    dsigma = diag_acobj[s, ik, ip].ac_eval(ep + de).real - sigmaR.real
+                    zn = 1.0 / (1.0 - dsigma / de)
+                    if gw.qpe_linearized_range is not None:
+                        zn = 1.0 if zn < gw.qpe_linearized_range[0] or zn > gw.qpe_linearized_range[1] else zn
+                    mo_energy[s, k, orbs[ip]] = ep + zn * (sigmaR + vk[s, k, p, p] - v_mf[s, k, p, p]).real
                 else:
                     # self-consistently solve QP equation
                     def quasiparticle(omega):
-                        if gw.ac == 'twopole':
-                            sigmaR = two_pole(omega-ef, coeff[s,k,:,p-orbs[0]]).real
-                        elif gw.ac == 'pade':
-                            sigmaR = pade_thiele(omega-ef, omega_fit[s,p-orbs[0]], coeff[s,k,:,p-orbs[0]]).real
-                        return omega - mf.mo_energy[s][kn][p] - (sigmaR.real + vk[s,kn,p,p].real - v_mf[s,kn,p,p].real)
+                        sigmaR = diag_acobj[s, ik, ip].ac_eval(omega)
+                        return omega - mf.mo_energy[s][k][orbs[ip]] - (sigmaR + vk[s, k, p, p] - v_mf[s, k, p, p]).real
+
                     try:
-                        e = newton(quasiparticle, mf.mo_energy[s][kn][p], tol=1e-6, maxiter=100)
-                        mo_energy[s,kn,p] = e
+                        mo_energy[s, k, orbs[ip]] = scipy.optimize.newton(
+                            quasiparticle, mf.mo_energy[s][k][orbs[ip]], tol=gw.qpe_tol, maxiter=gw.qpe_max_iter
+                        )
                     except RuntimeError:
-                        conv = False
-    mo_coeff = mf.mo_coeff
+                        logger.warn(gw, 'QPE for spin=%d k=%d orbital=%d not converged!', s, k, orbs[ip])
+
+    # save GW results
+    gw.mo_energy = mo_energy
+    gw.acobj = acobj
 
     if gw.verbose >= logger.DEBUG:
-        numpy.set_printoptions(threshold=nmoa)
-        for k in range(nkpts):
-            logger.debug(gw, '  GW mo_energy spin-up @ k%d =\n%s', k,mo_energy[0,k])
-        for k in range(nkpts):
-            logger.debug(gw, '  GW mo_energy spin-down @ k%d =\n%s', k,mo_energy[1,k])
-        numpy.set_printoptions(threshold=1000)
+        with np.printoptions(threshold=len(mf.mo_energy[0][0])):
+            for k in range(nkpts):
+                logger.debug(gw, '  GW mo_energy spin-up @ k%d =\n%s', k, mo_energy[0, k])
+            for k in range(nkpts):
+                logger.debug(gw, '  GW mo_energy spin-down @ k%d =\n%s', k, mo_energy[1, k])
+
+    if gw.writefile > 0:
+        with h5py.File('vxc.h5', 'w') as feri:
+            feri['vk'] = np.asarray(vk)
+            feri['v_mf'] = np.asarray(v_mf)
+
+        with h5py.File('sigma_imag.h5', 'w') as feri:
+            feri['sigmaI'] = np.asarray(sigmaI)
+            feri['omega'] = np.asarray(omega)
+            if gw.sigmaI is not None:
+                feri['sigmaI_full'] = np.asarray(gw.sigmaI)
+
+        acobj.save('ac_coeff.h5')
+
+    return
+
+
+def get_rho_response(omega, nocc, mo_energy, Lia, kidx):
+    """Get Pi=PV.
+    P is density-density response function.
+    V is two-electron integral.
+    See equation 24 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    omega : double
+        real position of imaginary frequency
+    nocc : list of int
+        number of occupied orbitals for two spins
+    mo_energy : double ndarray
+        orbital energy
+    Lia : list of complex 4d array
+        occupied-virtual block three-center density-fitting matrix in MO
+    kidx : list
+        momentum-conserved k-point list kj=kidx[ki]
+
+    Returns
+    -------
+    Pi : complex ndarray
+        Pi in auxiliary basis at freq iw
+    """
+    nkpts, naux = Lia[0].shape[:2]
+    nocc = [Lia[0].shape[2], Lia[1].shape[2]]
+    nvir = [Lia[0].shape[3], Lia[1].shape[3]]
 
-    return conv, mo_energy, mo_coeff
+    # Compute Pi for kL
+    Pi = np.zeros(shape=[naux, naux], dtype=np.complex128)
+    for i in range(nkpts):
+        # Find ka that conserves with ki and kL (-ki+ka+kL=G)
+        a = kidx[i]
 
-def get_rho_response(gw, omega, mo_energy, Lpq, kL, kidx):
-    '''
-    Compute density response function in auxiliary basis at freq iw
-    '''
-    spin, nkpts, naux, nmo, nmo = Lpq.shape
-    nocca, noccb = gw.nocc
-    kpts = gw.kpts
-    kscaled = gw.mol.get_scaled_kpts(kpts)
-    kscaled -= kscaled[0]
+        for s in range(2):
+            eia = mo_energy[s, i, :nocc[s], None] - mo_energy[s, a, None, nocc[s]:]
+            Lia_i_s = Lia[s][i]
+            eia = eia / (omega**2 + eia**2)
+            Pia = Lia_i_s * eia
+
+            # Pi += einsum('Pia,Qia->PQ', Pia, Lia.conj())
+            scipy.linalg.blas.zgemm(
+                alpha=2.0 / nkpts,
+                a=Lia_i_s.reshape(naux, nocc[s] * nvir[s]).T,
+                b=Pia.reshape(naux, nocc[s] * nvir[s]).T,
+                c=Pi.T,
+                trans_a=2,
+                trans_b=0,
+                beta=1.0,
+                overwrite_c=True,
+            )
+            Pia = Lia_i_s = None
+    return Pi
+
+
+def get_rho_response_metal(omega, mo_energy, mo_occ, Lpq, kidx):
+    """Get Pi=PV for metallic systems.
+    P is density-density response function.
+    V is two-electron integral.
+    See equation 24 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    omega : double
+        real position of imaginary frequency
+    mo_energy : double ndarray
+        orbital energy
+    mo_occ : double ndarray
+        occupation number
+    Lpq : complex ndarray
+        three-center density-fitting matrix in MO
+    kidx : list
+        momentum-conserved k-point list kj=kidx[ki]
+
+    Returns
+    -------
+    Pi : complex ndarray
+        Pi in auxiliary basis at freq iw
+    """
+    nkpts, nspin, naux, nmo, nmo = Lpq.shape
 
     # Compute Pi for kL
-    Pi = np.zeros((naux,naux),dtype=np.complex128)
-    for i, kpti in enumerate(kpts):
+    Pi = np.zeros(shape=[naux, naux], dtype=np.complex128)
+    for i in range(nkpts):
         # Find ka that conserves with ki and kL (-ki+ka+kL=G)
         a = kidx[i]
-        eia_a = mo_energy[0,i,:nocca,None] - mo_energy[0,a,None,nocca:]
-        eia_b = mo_energy[1,i,:noccb,None] - mo_energy[1,a,None,noccb:]
-        eia_a = eia_a/(omega**2+eia_a*eia_a)
-        eia_b = eia_b/(omega**2+eia_b*eia_b)
-        Pia_a = einsum('Pia,ia->Pia',Lpq[0,i][:,:nocca,nocca:],eia_a)
-        Pia_b = einsum('Pia,ia->Pia',Lpq[1,i][:,:noccb,noccb:],eia_b)
-        # Response from both spin-up and spin-down density
-        Pi += 2./nkpts * (einsum('Pia,Qia->PQ',Pia_a,Lpq[0,i][:,:nocca,nocca:].conj()) +
-                          einsum('Pia,Qia->PQ',Pia_b,Lpq[1,i][:,:noccb,noccb:].conj()))
+
+        for s in range(nspin):
+            eia = mo_energy[s, i, :, None] - mo_energy[s, a, None, :]
+            fia = mo_occ[s][i][:, None] - mo_occ[s][a][None, :]
+            Lia = np.ascontiguousarray(Lpq[i, s])
+            eia = eia * fia / (omega**2 + eia**2)
+            Pia = Lia * eia
+
+            # Pi += einsum('Pia, Qia -> PQ', Pia, Lia.conj()) / nkpts
+            scipy.linalg.blas.zgemm(
+                alpha=1.0 / nkpts,
+                a=Lia.reshape(naux, nmo * nmo).T,
+                b=Pia.reshape(naux, nmo * nmo).T,
+                c=Pi.T,
+                trans_a=2,
+                trans_b=0,
+                beta=1.0,
+                overwrite_c=True,
+            )
+            Pia = Lia = None
+    return Pi
+
+
+def get_rho_response_head(omega, mo_energy, qij):
+    """Compute head (G=0, G'=0) density response function in auxiliary basis at freq iw.
+    equation 48 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    omega : double
+        frequency point
+    mo_energy : double ndarray
+        orbital energy
+    qij : list of complex ndarray
+        pair density matrix defined as equation 51 in 10.1021/acs.jctc.0c00704
+
+    Returns
+    -------
+    Pi_00 : complex
+        head response function
+    """
+    nkpts = qij[0].shape[0]
+    nocc = [qij[0].shape[1], qij[1].shape[1]]
+
+    Pi_00 = 0j
+    for k in range(nkpts):
+        for s in range(2):
+            eia = mo_energy[s, k, : nocc[s], None] - mo_energy[s, k, None, nocc[s] :]
+            eia = eia / (omega**2 + eia**2)
+            Pi_00 += 2.0 / nkpts * einsum('ia,ia->', eia, qij[s][k].conj() * qij[s][k])
+
+    return Pi_00
+
+
+def get_rho_response_wing(omega, mo_energy, Lia, qij):
+    """Compute wing (G=P, G'=0) density response function in auxiliary basis at freq iw.
+     equation 48 in 10.1021/acs.jctc.0c00704
+
+     Parameters
+     ----------
+     omega : double
+         frequency point
+     mo_energy : double ndarray
+         orbital energy
+     Lia : complex ndarray
+         occupied-virtual block three-center density fitting matrix in MO
+     qij : list of complex ndarray
+         pair density matrix defined as equation 51 in 10.1021/acs.jctc.0c00704
+
+     Returns
+     -------
+    Pi : complex ndarray
+         wing response function
+    """
+    nkpts, naux = Lia[0].shape[:2]
+    nocc = [Lia[0].shape[2], Lia[1].shape[2]]
+    nvir = [Lia[0].shape[3], Lia[1].shape[3]]
+
+    Pi = np.zeros(shape=[naux], dtype=np.complex128)
+    for k in range(nkpts):
+        for s in range(2):
+            eia = mo_energy[s, k, :nocc[s], None] - mo_energy[s, k, None, nocc[s]:]
+            eia = eia / (omega**2 + eia**2)
+            eia_q = eia * qij[s][k].conj()
+
+            Pi += 2.0 / nkpts * np.matmul(Lia[s][k].reshape(naux, nocc[s] * nvir[s]), eia_q.reshape(-1))
+
     return Pi
 
-def get_sigma_diag(gw, orbs, kptlist, freqs, wts, iw_cutoff=None, max_memory=8000):
-    '''
-    Compute GW correlation self-energy (diagonal elements) in MO basis
-    on imaginary axis
-    '''
-    mo_energy = np.array(gw._scf.mo_energy)
-    mo_coeff = np.array(gw._scf.mo_coeff)
+
+def get_qij(gw, q, mo_energy, mo_coeff, uniform_grids=False):
+    """Compute pair density matrix in the long-wavelength limit through kp perturbation theory
+    qij = 1/Omega * |< psi_{ik} | e^{iqr} | psi_{ak-q} >|^2
+    equation 51 in 10.1021/acs.jctc.0c00704
+    Ref: Phys. Rev. B 83, 245122 (2011)
+
+    Parameters
+    ----------
+    gw : KUGWAC
+        gw object, provides attributes: nocc, nmo, kpts, mol
+    q : double
+        q grid
+    mo_energy : double ndarray
+        orbital energy
+    mo_coeff : complex ndarray
+        coefficient from AO to MO
+    uniform_grids : bool, optional
+        use uniform grids, by default False
+
+    Returns
+    -------
+    list
+        pair density matrix of two spins in the long-wavelength limit
+    """
     nocca, noccb = gw.nocc
     nmoa, nmob = gw.nmo
+    nvira = nmoa - nocca
+    nvirb = nmob - noccb
+    kpts = gw.kpts
+    nkpts = len(kpts)
+    cell = gw.mol
+
+    if uniform_grids:
+        with temporary_env(cell, verbose=0):
+            mydf = df.FFTDF(cell, kpts=kpts)
+            coords = cell.gen_uniform_grids(mydf.mesh)
+    else:
+        with temporary_env(cell, verbose=0):
+            coords, weights = dft.gen_grid.get_becke_grids(cell, level=4)
+    ngrid = len(coords)
+
+    qij_a = np.zeros(shape=[nkpts, nocca, nvira], dtype=np.complex128)
+    qij_b = np.zeros(shape=[nkpts, noccb, nvirb], dtype=np.complex128)
+    for i, kpti in enumerate(kpts):
+        ao_p = dft.numint.eval_ao(cell, coords, kpt=kpti, deriv=1)
+        ao = ao_p[0]
+        ao_grad = ao_p[1:4]
+        if uniform_grids:
+            ao_ao_grad = einsum('mg,xgn->xmn', ao.T.conj(), ao_grad) * cell.vol / ngrid
+        else:
+            ao_ao_grad = einsum('g,mg,xgn->xmn', weights, ao.T.conj(), ao_grad)
+        q_ao_ao_grad = -1j * einsum('x,xmn->mn', q, ao_ao_grad)
+        q_mo_mo_grad_a = reduce(
+            np.matmul, (mo_coeff[0, i][:, :nocca].T.conj(), q_ao_ao_grad, mo_coeff[0, i][:, nocca:])
+        )
+        q_mo_mo_grad_b = reduce(
+            np.matmul, (mo_coeff[1, i][:, :noccb].T.conj(), q_ao_ao_grad, mo_coeff[1, i][:, noccb:])
+        )
+        enm_a = 1.0 / (mo_energy[0, i][nocca:, None] - mo_energy[0, i][None, :nocca])
+        enm_b = 1.0 / (mo_energy[1, i][noccb:, None] - mo_energy[1, i][None, :noccb])
+        dens_a = enm_a.T * q_mo_mo_grad_a
+        dens_b = enm_b.T * q_mo_mo_grad_b
+        qij_a[i] = dens_a / np.sqrt(cell.vol)
+        qij_b[i] = dens_b / np.sqrt(cell.vol)
+
+    return (qij_a, qij_b)
+
+
+def get_sigma(
+    gw, freqs, wts, ef, mo_energy, orbs=None, kptlist=None, mo_coeff=None, mo_occ=None, iw_cutoff=None, fullsigma=False
+):
+    """Get GW self-energy.
+    See equation 27 in 10.1021/acs.jctc.0c00704
+
+    Parameters
+    ----------
+    gw : KUGWAC
+        GW objects,
+        provides attributes: _scf, mol, frozen, nmo, nocc, kpts, nkpts, mo_coeff, mo_occ, fc, fc_grid, with_df
+    freqs : double array
+        position of imaginary frequency
+    wts : double array
+        weight of frequency points
+    ef : double
+        Fermi level
+    mo_energy : double ndarray
+        non-frozen orbital energy
+    orbs : list, optional
+        orbital index in non-frozen nmo to calculate self-energy, by default None
+    kptlist : list, optional
+        k-point index to calculate self-energy, by default None
+    mo_coeff : complex ndarray, optional
+        coefficient from AO to non-frozen MO, by default None
+    mo_occ : double ndarray, optional
+        non-frozen occupation number, by default None
+    iw_cutoff : complex, optional
+        imaginary grid cutoff for fitting, by default None
+    fullsigma : bool, optional
+        calculate off-diagonal elements, by default False
+
+    Returns
+    -------
+    sigma: complex ndarray
+        self-energy on the imaginary axis
+    omega: complex ndarray
+        imaginary frequency grids of self-energy
+    """
+    nocca, noccb = nocc = gw.nocc
+    nmoa, nmob = nmo = gw.nmo
     nkpts = gw.nkpts
     kpts = gw.kpts
+
+    assert nmoa == nmob
+    if orbs is None:
+        orbs = list(range(nmoa))
+    if kptlist is None:
+        kptlist = list(range(nkpts))
+    norbs = len(orbs)
     nklist = len(kptlist)
     nw = len(freqs)
-    norbs = len(orbs)
-    mydf = gw.with_df
+
+    if mo_coeff is None:
+        mo_coeff = _mo_frozen(gw, gw.mo_coeff)
+    if mo_occ is None:
+        mo_occ = _mo_occ_frozen(gw, gw.mo_occ)
+    nao = mo_coeff.shape[2]
 
     # possible kpts shift
     kscaled = gw.mol.get_scaled_kpts(kpts)
     kscaled -= kscaled[0]
 
-    # This code does not support metals
-    homo = -99.
-    lumo = 99.
-    for k in range(nkpts):
-        if homo < max(mo_energy[0,k][nocca-1],mo_energy[1,k][noccb-1]):
-            homo = max(mo_energy[0,k][nocca-1],mo_energy[1,k][noccb-1])
-        if lumo > min(mo_energy[0,k][nocca],mo_energy[1,k][noccb]):
-            lumo = min(mo_energy[0,k][nocca],mo_energy[1,k][noccb])
-    if (lumo-homo)<1e-3:
-        logger.warn(gw, 'Current KUGW is not supporting metals!')
-    ef = (homo+lumo)/2.
-
     # Integration on numerical grids
-    if iw_cutoff is not None:
+    if iw_cutoff is not None and gw.rdm is False:
         nw_sigma = sum(iw < iw_cutoff for iw in freqs) + 1
     else:
         nw_sigma = nw + 1
 
-    # Compute occ for -iw and vir for iw separately
-    # to avoid branch cuts in analytic continuation
-    omega_occ = np.zeros((nw_sigma),dtype=np.complex128)
-    omega_vir = np.zeros((nw_sigma),dtype=np.complex128)
-    omega_occ[1:] = -1j*freqs[:(nw_sigma-1)]
-    omega_vir[1:] = 1j*freqs[:(nw_sigma-1)]
-    orbs_occ_a = [i for i in orbs if i < nocca]
-    orbs_occ_b = [i for i in orbs if i < noccb]
-    norbs_occ_a = len(orbs_occ_a)
-    norbs_occ_b = len(orbs_occ_b)
-
-    emo_occ_a = np.zeros((nkpts,nmoa,nw_sigma),dtype=np.complex128)
-    emo_occ_b = np.zeros((nkpts,nmob,nw_sigma),dtype=np.complex128)
-    emo_vir_a = np.zeros((nkpts,nmoa,nw_sigma),dtype=np.complex128)
-    emo_vir_b = np.zeros((nkpts,nmob,nw_sigma),dtype=np.complex128)
-    for k in range(nkpts):
-        emo_occ_a[k] = omega_occ[None,:] + ef - mo_energy[0,k][:,None]
-        emo_occ_b[k] = omega_occ[None,:] + ef - mo_energy[1,k][:,None]
-        emo_vir_a[k] = omega_vir[None,:] + ef - mo_energy[0,k][:,None]
-        emo_vir_b[k] = omega_vir[None,:] + ef - mo_energy[1,k][:,None]
-
-    sigma = np.zeros((2,nklist,norbs,nw_sigma),dtype=np.complex128)
-    omega = np.zeros((2,norbs,nw_sigma),dtype=np.complex128)
-    for s in range(2):
-        for p in range(norbs):
-            orbp = orbs[p]
-            if orbp < gw.nocc[s]:
-                omega[s,p] = omega_occ.copy()
-            else:
-                omega[s,p] = omega_vir.copy()
+    omega = np.zeros(shape=[nw_sigma], dtype=np.complex128)
+    omega[1:] = 1j * freqs[: (nw_sigma - 1)] + ef
+    emo_a = omega[None, None, :] - mo_energy[0][:, :, None]
+    emo_b = omega[None, None, :] - mo_energy[1][:, :, None]
 
+    if fullsigma is False:
+        sigma = np.zeros(shape=[2, nklist, norbs, nw_sigma], dtype=np.complex128)
+    else:
+        sigma = np.zeros(shape=[2, nklist, norbs, norbs, nw_sigma], dtype=np.complex128)
     if gw.fc:
         # Set up q mesh for q->0 finite size correction
-        q_pts = np.array([1e-3,0,0]).reshape(1,3)
+        if not gw.fc_grid:
+            q_pts = np.array([1e-3, 0, 0], dtype=np.double).reshape(1, 3)
+        else:
+            Nq = 3
+            q_pts = np.zeros(shape=[Nq**3 - 1, 3], dtype=np.double)
+            for i in range(Nq):
+                for j in range(Nq):
+                    for k in range(Nq):
+                        if i == 0 and j == 0 and k == 0:
+                            continue
+                        else:
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 0] = k * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 1] = j * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 2] = i * 5e-4
+        nq_pts = len(q_pts)
         q_abs = gw.mol.get_abs_kpts(q_pts)
 
         # Get qij = 1/sqrt(Omega) * < psi_{ik} | e^{iqr} | psi_{ak-q} > at q: (nkpts, nocc, nvir)
-        qij = get_qij(gw, q_abs[0], mo_coeff)
+        qij_a = np.zeros(shape=[nq_pts, nkpts, nocca, nmoa - nocca], dtype=np.complex128)
+        qij_b = np.zeros(shape=[nq_pts, nkpts, noccb, nmob - noccb], dtype=np.complex128)
+
+        if not gw.fc_grid:
+            for k in range(nq_pts):
+                qij_tmp = get_qij(gw, q_abs[k], mo_energy, mo_coeff)
+                qij_a[k] = qij_tmp[0]
+                qij_b[k] = qij_tmp[1]
+        else:
+            for k in range(nq_pts):
+                qij_tmp = get_qij(gw, q_abs[k], mo_energy, mo_coeff)
+                qij_a[k] = qij_tmp[0]
+                qij_b[k] = qij_tmp[1]
 
+    cderiarr = gw.with_df.cderi_array()
     for kL in range(nkpts):
-        # Lij: (2, ki, L, i, j) for looping every kL
-        #Lij = np.zeros((2,nkpts,naux,nmoa,nmoa),dtype=np.complex128)
+        # Lij: (ki, 2, L, i, j) for looping every kL
         Lij = []
         # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
         # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
-        kidx = np.zeros((nkpts),dtype=np.int64)
-        kidx_r = np.zeros((nkpts),dtype=np.int64)
+        kidx = np.zeros(shape=[nkpts], dtype=np.int64)
+        kidx_r = np.zeros(shape=[nkpts], dtype=np.int64)
         for i, kpti in enumerate(kpts):
             for j, kptj in enumerate(kpts):
                 # Find (ki,kj) that satisfies momentum conservation with kL
@@ -291,354 +556,461 @@ def get_sigma_diag(gw, orbs, kptlist, freqs, wts, iw_cutoff=None, max_memory=800
                 if is_kconserv:
                     kidx[i] = j
                     kidx_r[j] = i
-                    logger.debug(gw, "Read Lpq (kL: %s / %s, ki: %s, kj: %s)"%(kL+1, nkpts, i, j))
+                    logger.debug(gw, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s)' % (kL + 1, nkpts, i, j))
+
+                    # Read (L|pq) and ao2mo transform to (L|ij)
+                    Lpq = cderiarr.load(kpti, kptj)
+                    if Lpq.shape[-1] == (nao * (nao + 1)) // 2:
+                        Lpq = lib.unpack_tril(Lpq).reshape(-1, nao**2)
+                    else:
+                        Lpq = Lpq.reshape(-1, nao**2)
+                    Lpq = Lpq.astype(np.complex128)
+
+                    moija, ijslicea = _conc_mos(mo_coeff[0, i], mo_coeff[0, j])[2:]
+                    moijb, ijsliceb = _conc_mos(mo_coeff[1, i], mo_coeff[1, j])[2:]
                     Lij_out_a = None
                     Lij_out_b = None
-                    # Read (L|pq) and ao2mo transform to (L|ij)
-                    Lpq = []
-                    for LpqR, LpqI, sign \
-                            in mydf.sr_loop([kpti, kptj], max_memory=0.1*gw._scf.max_memory, compact=False):
-                        Lpq.append(LpqR+LpqI*1.0j)
-                    Lpq = np.vstack(Lpq).reshape(-1,nmoa**2)
-                    moija, ijslicea = _conc_mos(mo_coeff[0,i], mo_coeff[0,j])[2:]
-                    moijb, ijsliceb = _conc_mos(mo_coeff[1,i], mo_coeff[1,j])[2:]
-                    tao = []
-                    ao_loc = None
-                    Lij_out_a = _ao2mo.r_e2(Lpq, moija, ijslicea, tao, ao_loc, out=Lij_out_a)
-                    tao = []
-                    ao_loc = None
-                    Lij_out_b = _ao2mo.r_e2(Lpq, moijb, ijsliceb, tao, ao_loc, out=Lij_out_b)
-                    Lij.append(np.asarray((Lij_out_a.reshape(-1,nmoa,nmoa),Lij_out_b.reshape(-1,nmob,nmob))))
-
-        Lij = np.asarray(Lij)
-        Lij = Lij.transpose(1,0,2,3,4)
-        naux = Lij.shape[2]
+                    Lij_out_a = _ao2mo.r_e2(Lpq, moija, ijslicea, tao=[], ao_loc=None, out=Lij_out_a)
+                    Lij_out_b = _ao2mo.r_e2(Lpq, moijb, ijsliceb, tao=[], ao_loc=None, out=Lij_out_b)
+                    Lij.append(np.asarray((Lij_out_a.reshape(-1, nmoa, nmoa), Lij_out_b.reshape(-1, nmob, nmob))))
 
-        if kL == 0:
-            for w in range(nw):
-                # body dielectric matrix eps_body
-                Pi = get_rho_response(gw, freqs[w], mo_energy, Lij, kL, kidx)
-                eps_body_inv = np.linalg.inv(np.eye(naux)-Pi)
+        Lij = np.ascontiguousarray(Lij)
+        naux = Lij.shape[2]
+        if hasattr(gw._scf, 'sigma') is False:
+            Lia = [
+                np.ascontiguousarray(Lij[:, 0, :, : nocc[0], nocc[0] :]),
+                np.ascontiguousarray(Lij[:, 1, :, : nocc[1], nocc[1] :]),
+            ]
+
+        naux_ones = np.ones(shape=[1, naux], dtype=np.complex128)
+        for w in range(nw):
+            # body dielectric matrix eps_body
+            if hasattr(gw._scf, 'sigma'):
+                Pi = get_rho_response_metal(freqs[w], mo_energy, mo_occ, Lij, kidx)
+            else:
+                Pi = get_rho_response(freqs[w], nocc, mo_energy, Lia, kidx)
+            Pi_inv = np.linalg.inv(np.eye(naux) - Pi)
 
-                if gw.fc:
-                    # head dielectric matrix eps_00
-                    Pi_00 = get_rho_response_head(gw, freqs[w], mo_energy, qij)
-                    eps_00 = 1. - 4. * np.pi/np.linalg.norm(q_abs[0])**2 * Pi_00
+            if gw.fc and kL == 0:
+                eps_inv_00 = 0j
+                eps_inv_P0 = np.zeros(shape=[naux], dtype=np.complex128)
+                for iq in range(nq_pts):
+                    # head dielectric matrix eps_00, equation 47 in 10.1021/acs.jctc.0c00704
+                    Pi_00 = get_rho_response_head(freqs[w], mo_energy, (qij_a[iq], qij_b[iq]))
+                    eps_00 = 1.0 - 4.0 * np.pi / np.linalg.norm(q_abs[iq]) ** 2 * Pi_00
 
-                    # wings dielectric matrix eps_P0
-                    Pi_P0 = get_rho_response_wing(gw, freqs[w], mo_energy, Lij, qij)
-                    eps_P0 = -np.sqrt(4.*np.pi) / np.linalg.norm(q_abs[0]) * Pi_P0
+                    # wings dielectric matrix eps_P0, equation 48 in 10.1021/acs.jctc.0c00704
+                    Pi_P0 = get_rho_response_wing(freqs[w], mo_energy, Lia, (qij_a[iq], qij_b[iq]))
+                    eps_P0 = -np.sqrt(4.0 * np.pi) / np.linalg.norm(q_abs[iq]) * Pi_P0
 
                     # inverse dielectric matrix
-                    eps_inv_00 = 1./(eps_00 - np.dot(np.dot(eps_P0.conj(),eps_body_inv),eps_P0))
-                    eps_inv_P0 = -eps_inv_00 * np.dot(eps_body_inv, eps_P0)
-
-                    # head correction
-                    Del_00 = 2./np.pi * (6.*np.pi**2/gw.mol.vol/nkpts)**(1./3.) * (eps_inv_00 - 1.)
-
-                eps_inv_PQ = eps_body_inv
-                g0_occ_a = wts[w] * emo_occ_a / (emo_occ_a**2+freqs[w]**2)
-                g0_occ_b = wts[w] * emo_occ_b / (emo_occ_b**2+freqs[w]**2)
-                g0_vir_a = wts[w] * emo_vir_a / (emo_vir_a**2+freqs[w]**2)
-                g0_vir_b = wts[w] * emo_vir_b / (emo_vir_b**2+freqs[w]**2)
-                for k in range(nklist):
-                    kn = kptlist[k]
-                    # Find km that conserves with kn and kL (-km+kn+kL=G)
-                    km = kidx_r[kn]
-                    Qmn_a = einsum('Pmn,PQ->Qmn',Lij[0,km][:,:,orbs].conj(),eps_inv_PQ-np.eye(naux))
-                    Qmn_b = einsum('Pmn,PQ->Qmn',Lij[1,km][:,:,orbs].conj(),eps_inv_PQ-np.eye(naux))
-                    Wmn_a = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn_a,Lij[0,km][:,:,orbs])
-                    Wmn_b = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn_b,Lij[1,km][:,:,orbs])
-
-                    sigma[0,k][:norbs_occ_a] += -einsum('mn,mw->nw',Wmn_a[:,:norbs_occ_a],g0_occ_a[km])/np.pi
-                    sigma[1,k][:norbs_occ_b] += -einsum('mn,mw->nw',Wmn_b[:,:norbs_occ_b],g0_occ_b[km])/np.pi
-                    sigma[0,k][norbs_occ_a:] += -einsum('mn,mw->nw',Wmn_a[:,norbs_occ_a:],g0_vir_a[km])/np.pi
-                    sigma[1,k][norbs_occ_b:] += -einsum('mn,mw->nw',Wmn_b[:,norbs_occ_b:],g0_vir_b[km])/np.pi
-
-                    if gw.fc:
-                        # apply head correction
-                        assert (kn == km)
-                        sigma[0,k][:norbs_occ_a] += -Del_00 * g0_occ_a[kn][orbs][:norbs_occ_a] /np.pi
-                        sigma[0,k][norbs_occ_a:] += -Del_00 * g0_vir_a[kn][orbs][norbs_occ_a:] /np.pi
-                        sigma[1,k][:norbs_occ_b] += -Del_00 * g0_occ_b[kn][orbs][:norbs_occ_b] /np.pi
-                        sigma[1,k][norbs_occ_b:] += -Del_00 * g0_vir_b[kn][orbs][norbs_occ_b:] /np.pi
-
-                        # apply wing correction
-                        Wn_P0_a = einsum('Pnm,P->nm',Lij[0,kn],eps_inv_P0).diagonal()
-                        Wn_P0_b = einsum('Pnm,P->nm',Lij[1,kn],eps_inv_P0).diagonal()
-                        Wn_P0_a = Wn_P0_a.real * 2.
-                        Wn_P0_b = Wn_P0_b.real * 2.
-                        Del_P0_a = np.sqrt(gw.mol.vol/4./np.pi**3) * (6.*np.pi**2/gw.mol.vol/nkpts)**(2./3.) * Wn_P0_a[orbs]  # noqa: E501
-                        Del_P0_b = np.sqrt(gw.mol.vol/4./np.pi**3) * (6.*np.pi**2/gw.mol.vol/nkpts)**(2./3.) * Wn_P0_b[orbs]  # noqa: E501
-                        sigma[0,k][:norbs_occ_a] += -einsum('n,nw->nw',Del_P0_a[:norbs_occ_a],g0_occ_a[kn][orbs][:norbs_occ_a]) /np.pi  # noqa: E501
-                        sigma[0,k][norbs_occ_a:] += -einsum('n,nw->nw',Del_P0_a[norbs_occ_a:],g0_vir_a[kn][orbs][norbs_occ_a:]) /np.pi  # noqa: E501
-                        sigma[1,k][:norbs_occ_b] += -einsum('n,nw->nw',Del_P0_b[:norbs_occ_b],g0_occ_b[kn][orbs][:norbs_occ_b]) /np.pi  # noqa: E501
-                        sigma[1,k][norbs_occ_b:] += -einsum('n,nw->nw',Del_P0_b[norbs_occ_b:],g0_vir_b[kn][orbs][norbs_occ_b:]) /np.pi  # noqa: E501
-        else:
-            for w in range(nw):
-                Pi = get_rho_response(gw, freqs[w], mo_energy, Lij, kL, kidx)
-                Pi_inv = np.linalg.inv(np.eye(naux)-Pi)-np.eye(naux)
-                g0_occ_a = wts[w] * emo_occ_a / (emo_occ_a**2+freqs[w]**2)
-                g0_occ_b = wts[w] * emo_occ_b / (emo_occ_b**2+freqs[w]**2)
-                g0_vir_a = wts[w] * emo_vir_a / (emo_vir_a**2+freqs[w]**2)
-                g0_vir_b = wts[w] * emo_vir_b / (emo_vir_b**2+freqs[w]**2)
-                for k in range(nklist):
-                    kn = kptlist[k]
-                    # Find km that conserves with kn and kL (-km+kn+kL=G)
-                    km = kidx_r[kn]
-                    Qmn_a = einsum('Pmn,PQ->Qmn',Lij[0,km][:,:,orbs].conj(),Pi_inv)
-                    Qmn_b = einsum('Pmn,PQ->Qmn',Lij[1,km][:,:,orbs].conj(),Pi_inv)
-                    Wmn_a = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn_a,Lij[0,km][:,:,orbs])
-                    Wmn_b = 1./nkpts * einsum('Qmn,Qmn->mn',Qmn_b,Lij[1,km][:,:,orbs])
-
-                    sigma[0,k][:norbs_occ_a] += -einsum('mn,mw->nw',Wmn_a[:,:norbs_occ_a],g0_occ_a[km])/np.pi
-                    sigma[1,k][:norbs_occ_b] += -einsum('mn,mw->nw',Wmn_b[:,:norbs_occ_b],g0_occ_b[km])/np.pi
-                    sigma[0,k][norbs_occ_a:] += -einsum('mn,mw->nw',Wmn_a[:,norbs_occ_a:],g0_vir_a[km])/np.pi
-                    sigma[1,k][norbs_occ_b:] += -einsum('mn,mw->nw',Wmn_b[:,norbs_occ_b:],g0_vir_b[km])/np.pi
+                    # equation 53 in 10.1021/acs.jctc.0c00704
+                    eps_inv_00 += 1.0 / nq_pts * 1.0 / (eps_00 - reduce(np.matmul, (eps_P0.conj(), Pi_inv, eps_P0)))
+                    # equation 54 in 10.1021/acs.jctc.0c00704
+                    eps_inv_P0 += 1.0 / nq_pts * (-eps_inv_00) * np.matmul(Pi_inv, eps_P0)
+
+                # head correction
+                Del_00 = 2.0 / np.pi * (6.0 * np.pi**2 / gw.mol.vol / nkpts) ** (1.0 / 3.0) * (eps_inv_00 - 1.0)
+
+            Pi_inv -= np.eye(naux)
+            g0_a = wts[w] * emo_a / (emo_a**2 + freqs[w] ** 2)
+            g0_b = wts[w] * emo_b / (emo_b**2 + freqs[w] ** 2)
+            g0 = [g0_a, g0_b]
+            for k, kn in enumerate(kptlist):
+                # Find km that conserves with kn and kL (-km+kn+kL=G)
+                km = kidx_r[kn]
+
+                for s in range(2):
+                    # Qmn_a = einsum('Pmn,PQ->Qmn', Lij[km, s][:, :, orbs].conj(), Pi_inv)
+                    if len(orbs) == nmo[s]:
+                        l_slice = Lij[km, s].reshape(naux, -1)
+                    else:
+                        l_slice = np.ascontiguousarray(Lij[km, s, :, :, mkslice(orbs)].reshape(naux, -1))
+                    Qmn = np.zeros(shape=[nmo[s] * norbs, naux], dtype=np.complex128)
+                    scipy.linalg.blas.zgemm(alpha=1.0, a=Pi_inv.T, b=l_slice.T, c=Qmn.T, overwrite_c=1, trans_b=2)
+                    Qmn = Qmn.T
+
+                    if fullsigma is False:
+                        # Wmn = 1.0 / nkpts * einsum('Qmn,Qmn->mn', Qmn, Lij[km, s][:, :, orbs])
+                        Qmn = Qmn * l_slice
+                        Wmn = np.matmul(naux_ones, Qmn)
+                        array_scale(Wmn, 1.0 / nkpts)
+
+                        # sigma[s, k] += -einsum('mn,mw->nw', Wmn, g0[s][km]) / np.pi
+                        sigma[s, k] -= np.matmul(Wmn.reshape(nmo[s], norbs).T, g0[s][km]) / np.pi
+                    else:
+                        # for orbm in range(nmo):
+                        #     Wmn[orbm] = 1./nkpts * np.dot(Qmn[:,orbm,:].transpose(),Lij[km][:,orbm,orbs])
+                        Qmn = Qmn.reshape(naux, nmo[s], norbs)
+                        Wmn = np.zeros(shape=[nmo[s], norbs, norbs], dtype=np.complex128)
+                        for m in range(nmo[s]):
+                            np.matmul(Qmn[:, m, :].T, np.ascontiguousarray(Lij[km, s, :, m, mkslice(orbs)]), out=Wmn[m])
+                        array_scale(Wmn, 1.0 / nkpts)
+                        Wmn = Wmn.reshape(nmo[s], norbs * norbs).T
+
+                        # sigma[s, k] += -einsum('mnl,mw->nlw',Wmn,g0[km])/np.pi
+                        sigma[s, k] -= np.matmul(Wmn, g0[s][km]).reshape(norbs, norbs, nw_sigma) / np.pi
+
+                if gw.fc and kL == 0:
+                    assert kn == km
+                    for s in range(2):
+                        if fullsigma is False:
+                            # apply head correction
+                            sigma[s, k] += -Del_00 * g0[s][kn][orbs] / np.pi
+
+                            # apply wing correction
+                            Wn_P0 = einsum('Pnn,P->n', Lij[kn, s], eps_inv_P0)
+                            Wn_P0 = Wn_P0[orbs].real * 2.0
+                            Del_P0 = (
+                                np.sqrt(gw.mol.vol / 4.0 / np.pi**3)
+                                * (6.0 * np.pi**2 / gw.mol.vol / nkpts) ** (2 / 3)
+                                * Wn_P0
+                            )
+
+                            sigma[s, k] += -einsum('n,nw->nw', Del_P0, g0[s][kn][orbs]) / np.pi
+                        else:
+                            # head correction
+                            tmp = -Del_00 * g0[s][kn][orbs] / np.pi
+                            for p in range(norbs):
+                                sigma[s, k, p, p, :] += tmp[p, :]
+                            # sigma[s, k, np.arange(norbs), np.arange(norbs), :] += tmp
+
+                            # wing correction
+                            Wn_P0 = einsum('Pnn,P->n', Lij[kn, s], eps_inv_P0)
+                            Wn_P0 = Wn_P0[orbs].real * 2.0
+                            Del_P0 = (
+                                np.sqrt(gw.mol.vol / 4.0 / np.pi**3)
+                                * (6.0 * np.pi**2 / gw.mol.vol / nkpts) ** (2 / 3)
+                                * Wn_P0
+                            )
+                            tmp = -einsum('n,nw->nw', Del_P0, g0[s][kn][orbs]) / np.pi
+                            for p in range(norbs):
+                                sigma[s, k, p, p, :] += tmp[p, :]
+                            #sigma[s, k, np.arange(norbs), np.arange(norbs), :] += tmp
+
+    if gw.rdm:
+        gw.sigmaI = sigma
 
     return sigma, omega
 
-def get_rho_response_head(gw, omega, mo_energy, qij):
-    '''
-    Compute head (G=0, G'=0) density response function in auxiliary basis at freq iw
-    '''
-    qij_a, qij_b = qij
-    nocca, noccb = gw.nocc
-    kpts = gw.kpts
-    nkpts = len(kpts)
 
-    # Compute Pi head
-    Pi_00 = 0j
-    for i, kpti in enumerate(kpts):
-        eia_a = mo_energy[0,i,:nocca,None] - mo_energy[0,i,None,nocca:]
-        eia_b = mo_energy[1,i,:noccb,None] - mo_energy[1,i,None,noccb:]
-        eia_a = eia_a/(omega**2+eia_a*eia_a)
-        eia_b = eia_b/(omega**2+eia_b*eia_b)
-        Pi_00 += 2./nkpts * (einsum('ia,ia->',eia_a,qij_a[i].conj()*qij_a[i]) +
-                             einsum('ia,ia->',eia_b,qij_b[i].conj()*qij_b[i]))
-    return Pi_00
-
-def get_rho_response_wing(gw, omega, mo_energy, Lpq, qij):
-    '''
-    Compute wing (G=P, G'=0) density response function in auxiliary basis at freq iw
-    '''
-    qij_a, qij_b = qij
-    spin, nkpts, naux, nmo, nmo = Lpq.shape
-    nocca, noccb = gw.nocc
-    kpts = gw.kpts
-    nkpts = len(kpts)
+def get_ef(kmf, mo_energy):
+    """Get Fermi level.
+    For gapped systems, Fermi level is computed as the average between HOMO and LUMO.
+    For metallic systems, Fermi level is optmized according to mo_energy.
 
-    # Compute Pi wing
-    Pi = np.zeros(naux,dtype=np.complex128)
-    for i, kpti in enumerate(kpts):
-        eia_a = mo_energy[0,i,:nocca,None] - mo_energy[0,i,None,nocca:]
-        eia_b = mo_energy[1,i,:noccb,None] - mo_energy[1,i,None,noccb:]
-        eia_a = eia_a/(omega**2+eia_a*eia_a)
-        eia_b = eia_b/(omega**2+eia_b*eia_b)
-        eia_q_a = eia_a * qij_a[i].conj()
-        eia_q_b = eia_b * qij_b[i].conj()
-        Pi += 2./nkpts * (einsum('Pia,ia->P',Lpq[0,i][:,:nocca,nocca:],eia_q_a) +
-                          einsum('Pia,ia->P',Lpq[1,i][:,:noccb,noccb:],eia_q_b))
-    return Pi
+    Parameters
+    ----------
+    kmf : pyscf.pbc.scf.uhf.UHF/pyscf.pbc.dft.uks.UKS
+        mean-field object, provides attributes: kpts, sigma, smearing_method
+    mo_energy : double array
+        orbital energy
 
-def get_qij(gw, q, mo_coeff, uniform_grids=False):
-    '''
-    Compute qij = 1/Omega * |< psi_{ik} | e^{iqr} | psi_{ak-q} >|^2 at q: (nkpts, nocc, nvir)
-    through kp perturbation theory
-    Ref: Phys. Rev. B 83, 245122 (2011)
-    '''
-    nocca, noccb = gw.nocc
-    nmoa, nmob = gw.nmo
-    nvira = nmoa - nocca
-    nvirb = nmob - noccb
-    kpts = gw.kpts
-    nkpts = len(kpts)
-    cell = gw.mol
-    mo_energy = np.asarray(gw._scf.mo_energy)
+    Returns
+    -------
+    ef : double
+        Fermi level
+    """
+    if hasattr(kmf, "sigma"):
+        from pyscf.scf import addons as mol_addons
 
-    if uniform_grids:
-        mydf = df.FFTDF(cell, kpts=kpts)
-        coords = cell.gen_uniform_grids(mydf.mesh)
+        if kmf.smearing_method.lower() == "fermi":
+            f_occ = mol_addons._fermi_smearing_occ
+        else:
+            f_occ = mol_addons._gaussian_smearing_occ
+        mo_energy_stack_a = np.hstack(mo_energy[0])
+        mo_energy_stack_b = np.hstack(mo_energy[1])
+        mo_energy_stack = np.append(mo_energy_stack_a, mo_energy_stack_b)
+        nelectron = kmf.mol.tot_electrons(len(kmf.kpts))
+        ef = mol_addons._smearing_optimize(f_occ, mo_energy_stack, nelectron, kmf.sigma)[0]
     else:
-        coords, weights = dft.gen_grid.get_becke_grids(cell,level=4)
-    ngrid = len(coords)
+        nkpts = len(kmf.kpts)
+        neleca = 0.0
+        nelecb = 0.0
+        for k in range(nkpts):
+            neleca += np.sum(kmf.mo_occ[0][k])
+            nelecb += np.sum(kmf.mo_occ[1][k])
+        nocca = int(neleca / nkpts)
+        noccb = int(nelecb / nkpts)
+
+        homo = -99.0
+        lumo = 99.0
+        for k in range(len(kmf.kpts)):
+            if homo < max(mo_energy[0][k][nocca - 1], mo_energy[1][k][noccb - 1]):
+                homo = max(mo_energy[0][k][nocca - 1], mo_energy[1][k][noccb - 1])
+            if lumo > min(mo_energy[0][k][nocca], mo_energy[1][k][noccb]):
+                lumo = min(mo_energy[0][k][nocca], mo_energy[1][k][noccb])
+        ef = (homo + lumo) / 2.0
+    return ef
+
+
+def get_g0_k(omega, mo_energy, eta):
+    """Get non-interacting Green's function.
+
+    Parameters
+    ----------
+    omega : double or complex ndarray
+        frequency grids
+    mo_energy : double ndarray
+        orbital energy
+    eta : double
+        broadening parameter
+
+    Returns
+    -------
+    gf0 : complex ndarray
+        non-interacting Green's function
+    """
+    nkpts = len(mo_energy[0])
+    nmo = mo_energy[0][0].shape[0]
+    nw = len(omega)
+    gf0 = np.zeros(shape=[2, nkpts, nmo, nmo, nw], dtype=np.complex128)
+    for s in range(2):
+        for k in range(nkpts):
+            for iw in range(nw):
+                gf0[s, k, :, :, iw] = np.diag(1.0 / (omega[iw] + 1j * eta - mo_energy[s][k]))
+    return gf0
+
+
+def make_gf(gw, omega, eta):
+    """Get dynamical Green's function and self-energy.
+
+    Parameters
+    ----------
+    gw : KUGWAC
+        GW object, provides attributes: orbs, kptlist, ef, ac_coeff, omega_fit, vk, vxc, _scf.mo_energy
+    omega : double or complex array
+        frequency grids
+    eta : double
+        broadening parameter
+
+    Returns
+    -------
+    gf : complex ndarray
+        GW Green's function
+    gf0 : complex ndarray
+        mean-field Green's function
+    sigma : complex ndarray
+        GW correlation self-energy
+    """
+    nmo = gw.nmo[0]
+
+    nomega = len(omega)
+    sigma = np.zeros(shape=[2, gw.nkpts, nmo, nmo, nomega], dtype=np.complex128)
+    if gw.fullsigma:
+        for s in range(2):
+            for ik, k in enumerate(gw.kptlist):
+                for ip, p in enumerate(gw.orbs_frz):
+                    for iq, q in enumerate(gw.orbs_frz):
+                        sigma[s, k, p, q] = gw.acobj[s, ik, ip, iq].ac_eval(omega + 1j * eta)
+                        sigma[s, k, p, q] += gw.vk[s, k, p, q] - gw.vxc[s, k, p, q]
+    else:
+        for s in range(2):
+            for k, kn in enumerate(gw.kptlist):
+                for ip, p in enumerate(gw.orbs_frz):
+                    sigma[s, k, p, p] = gw.acobj[s, ik, ip].ac_eval(omega + 1j * eta)
+                    sigma[s, kn, p, p] += gw.vk[s, kn, p, p] - gw.vxc[s, kn, p, p]
+
+    gf0 = get_g0_k(omega, gw._scf.mo_energy, eta)
+    gf = np.zeros_like(gf0)
+    for s in range(2):
+        for k in range(gw.nkpts):
+            for iw in range(nomega):
+                gf[s, k, :, :, iw] = np.linalg.inv(np.linalg.inv(gf0[s, k, :, :, iw]) - sigma[s, k, :, :, iw])
+
+    return gf, gf0, sigma
+
+
+def make_rdm1_linear(gw, ao_repr=False):
+    """Get GW density matrix from Green's function G(it=0).
+    G is from linear Dyson equation, which conserves particle number
+    G = G0 + G0 Sigma G0
+    See equation 16 in 10.1021/acs.jctc.0c01264
+
+    Parameters
+    ----------
+    gw : KUGWAC
+        GW object, provides attributes: sigmaI, mol, _scf, freqs, wts, frozen, orbs, fc
+    ao_repr : bool, optional
+        return density matrix in AO, by default False
+
+    Returns
+    -------
+    rdm1 : double ndarray
+        density matrix
+    """
+    assert gw.sigmaI is not None
+    assert gw.rdm is True and gw.fullsigma is True
+    assert gw.frozen is None or gw.frozen == 0
+    sigmaI = gw.sigmaI[:, :, :, 1:]
+    freqs = 1j * gw.freqs
+    wts = gw.wts
+    nmo = gw.nmo
+    nkpts = gw.nkpts
+    if len(gw.orbs) != nmo:
+        sigma = np.zeros(shape=[nkpts, nmo, nmo, len(freqs)], dtype=sigmaI.dtype)
+        for s in range(2):
+            for k in range(nkpts):
+                for ia, a in enumerate(gw.orbs):
+                    for ib, b in enumerate(gw.orbs):
+                        sigma[s, k, a, b, :] = sigmaI[s, k, ia, ib, :]
+    else:
+        sigma = sigmaI
 
-    qij_a = np.zeros((nkpts,nocca,nvira),dtype=np.complex128)
-    qij_b = np.zeros((nkpts,noccb,nvirb),dtype=np.complex128)
-    for i, kpti in enumerate(kpts):
-        ao_p = dft.numint.eval_ao(cell, coords, kpt=kpti, deriv=1)
-        ao = ao_p[0]
-        ao_grad = ao_p[1:4]
-        if uniform_grids:
-            ao_ao_grad = einsum('mg,xgn->xmn',ao.T.conj(),ao_grad) * cell.vol / ngrid
-        else:
-            ao_ao_grad = einsum('g,mg,xgn->xmn',weights,ao.T.conj(),ao_grad)
-        q_ao_ao_grad = -1j * einsum('x,xmn->mn',q,ao_ao_grad)
-        q_mo_mo_grad_a = np.dot(np.dot(mo_coeff[0,i][:,:nocca].T.conj(), q_ao_ao_grad), mo_coeff[0,i][:,nocca:])
-        q_mo_mo_grad_b = np.dot(np.dot(mo_coeff[1,i][:,:noccb].T.conj(), q_ao_ao_grad), mo_coeff[1,i][:,noccb:])
-        enm_a = 1./(mo_energy[0,i][nocca:,None] - mo_energy[0,i][None,:nocca])
-        enm_b = 1./(mo_energy[1,i][noccb:,None] - mo_energy[1,i][None,:noccb])
-        dens_a = enm_a.T * q_mo_mo_grad_a
-        dens_b = enm_b.T * q_mo_mo_grad_b
-        qij_a[i] = dens_a / np.sqrt(cell.vol)
-        qij_b[i] = dens_b / np.sqrt(cell.vol)
+    for iw in range(len(freqs)):
+        sigma[:, :, :, iw] += gw.vk - gw.vxc
+    gf0 = get_g0_k(freqs, np.array(gw._scf.mo_energy) - gw.ef, eta=0)
+    gf = np.array(gf0, copy=True)
+    for s in range(2):
+        for k in range(nkpts):
+            for iw in range(len(freqs)):
+                gf[s, k, :, :, iw] = gf0[s, k, :, :, iw] @ sigma[s, k, :, :, iw] @ gf0[s, k, :, :, iw]
 
-    return (qij_a, qij_b)
+    # GW density matrix
+    rdm1 = np.zeros(shape=[2, nkpts, nmo, nmo], dtype=np.double)
+    for s in range(2):
+        for k in range(nkpts):
+            rdm1[s, k] = (1.0 / np.pi) * einsum('ijw, w -> ij', gf[s, k], wts).real + np.eye(nmo) * 0.5
+            channel = "spin-up" if s == 0 else "spin-down"
+            logger.info(gw, 'GW particle number %s @ k%d = %s', channel, k, np.trace(rdm1[s, k]))
 
-def _get_scaled_legendre_roots(nw):
-    """
-    Scale nw Legendre roots, which lie in the
-    interval [-1, 1], so that they lie in [0, inf)
-    Ref: www.cond-mat.de/events/correl19/manuscripts/ren.pdf
+    # Symmetrize density matrix
+    for s in range(2):
+        for k in range(nkpts):
+            rdm1[s, k] = 0.5 * (rdm1[s, k] + rdm1[s, k].T)
 
-    Returns:
-        freqs : 1D ndarray
-        wts : 1D ndarray
-    """
-    freqs, wts = np.polynomial.legendre.leggauss(nw)
-    x0 = 0.5
-    freqs_new = x0*(1.+freqs)/(1.-freqs)
-    wts = wts*2.*x0/(1.-freqs)**2
-    return freqs_new, wts
+    if ao_repr is True:
+        ovlp = gw._scf.get_ovlp()
+        for s in range(2):
+            for k in range(nkpts):
+                CS = np.matmul(ovlp, gw._scf.mo_coeff[s, k])
+                rdm1[s, k] = reduce(np.matmul, (CS, rdm1[s, k], CS.conj().T))
+
+    return rdm1
 
-def _get_clenshaw_curtis_roots(nw):
-    """
-    Clenshaw-Curtis quadrature on [0,inf)
-    Ref: J. Chem. Phys. 132, 234114 (2010)
-    Returns:
-        freqs : 1D ndarray
-        wts : 1D ndarray
-    """
-    freqs = np.zeros(nw)
-    wts = np.zeros(nw)
-    a = 0.2
-    for w in range(nw):
-        t = (w+1.0)/nw * np.pi/2.
-        freqs[w] = a / np.tan(t)
-        if w != nw-1:
-            wts[w] = a*np.pi/2./nw/(np.sin(t)**2)
-        else:
-            wts[w] = a*np.pi/4./nw/(np.sin(t)**2)
-    return freqs[::-1], wts[::-1]
 
-def two_pole_fit(coeff, omega, sigma):
-    cf = coeff[:5] + 1j*coeff[5:]
-    f = cf[0] + cf[1]/(omega+cf[3]) + cf[2]/(omega+cf[4]) - sigma
-    f[0] = f[0]/0.01
-    return np.array([f.real,f.imag]).reshape(-1)
+def _mo_energy_frozen(gw, mo_energy):
+    """Get non-frozen orbital energy.
+    Assume nmoa = nmob.
 
-def two_pole(freqs, coeff):
-    cf = coeff[:5] + 1j*coeff[5:]
-    return cf[0] + cf[1]/(freqs+cf[3]) + cf[2]/(freqs+cf[4])
+    Parameters
+    ----------
+    gw : KUGWAC
+        GW object, provides attributes: frozen, nmo, nkpt
+    mo_energy : double ndarray
+        full orbital energy
 
-def AC_twopole_diag(sigma, omega, orbs, nocc):
+    Returns
+    -------
+    mo_energy_frozen : double ndarray
+        non-frozen orbital energy
     """
-    Analytic continuation to real axis using a two-pole model
-    Returns:
-        coeff: 2D array (ncoeff, norbs)
+    frozen_mask = get_frozen_mask(gw)
+    nmoa, _ = gw.nmo
+    nkpts = gw.nkpts
+    mo_energy_frozen = np.zeros(shape=[2, nkpts, nmoa], dtype=np.double)
+    for s in range(2):
+        for k in range(nkpts):
+            mo_energy_frozen[s, k] = mo_energy[s][k][frozen_mask[s][k]]
+    return mo_energy_frozen
+
+
+def _mo_frozen(gw, mo):
+    """Get non-frozen orbital coefficient.
+    Assume nmoa = nmob.
+
+    Parameters
+    ----------
+    gw : KUGWAC
+        GW object, provides attributes: frozen, nmo, nkpt
+    mo : complex ndarray
+        full orbital coefficient
+
+    Returns
+    -------
+    mo_frozen : complex ndarray
+        non-frozen orbital coefficient
     """
-    norbs, nw = sigma.shape
-    coeff = np.zeros((10,norbs))
-    for p in range(norbs):
-        if orbs[p] < nocc:
-            x0 = np.array([0, 1, 1, 1, -1, 0, 0, 0, -1.0, -0.5])
-        else:
-            x0 = np.array([0, 1, 1, 1, -1, 0, 0, 0, 1.0, 0.5])
-        #TODO: analytic gradient
-        xopt = least_squares(two_pole_fit, x0, jac='3-point', method='trf', xtol=1e-10,
-                             gtol = 1e-10, max_nfev=2000, verbose=0, args=(omega[p], sigma[p]))
-        if xopt.success is False:
-            print('WARN: 2P-Fit Orb %d not converged, cost function %e'%(p,xopt.cost))
-        coeff[:,p] = xopt.x.copy()
-    return coeff
-
-def thiele(fn,zn):
-    nfit = len(zn)
-    g = np.zeros((nfit,nfit),dtype=np.complex128)
-    g[:,0] = fn.copy()
-    for i in range(1,nfit):
-        g[i:,i] = (g[i-1,i-1]-g[i:,i-1])/((zn[i:]-zn[i-1])*g[i:,i-1])
-    a = g.diagonal()
-    return a
-
-def pade_thiele(freqs,zn,coeff):
-    nfit = len(coeff)
-    X = coeff[-1]*(freqs-zn[-2])
-    for i in range(nfit-1):
-        idx = nfit-i-1
-        X = coeff[idx]*(freqs-zn[idx-1])/(1.+X)
-    X = coeff[0]/(1.+X)
-    return X
-
-def AC_pade_thiele_diag(sigma, omega):
+    frozen_mask = get_frozen_mask(gw)
+    nmoa, _ = gw.nmo
+    nkpts = gw.nkpts
+    nao = mo[0][0].shape[0]
+    mo_frozen = np.zeros(shape=[2, nkpts, nao, nmoa], dtype=np.complex128)
+    for s in range(2):
+        for k in range(nkpts):
+            mo_frozen[s, k] = mo[s][k][:, frozen_mask[s][k]]
+    return mo_frozen
+
+
+def _mo_occ_frozen(gw, mo_occ):
+    """Get non-frozen occupation number.
+    Assume nmoa = nmob.
+
+    Parameters
+    ----------
+    gw : KUGWAC
+        GW object, provides attributes: frozen, nmo, nkpt
+    mo_occ : complex ndarray
+        full occupation number
+
+    Returns
+    -------
+    mo_occ_frozen : double ndarray
+        non-frozen occupation number
     """
-    Analytic continuation to real axis using a Pade approximation
-    from Thiele's reciprocal difference method
-    Reference: J. Low Temp. Phys. 29, 179 (1977)
-    Returns:
-        coeff: 2D array (ncoeff, norbs)
-        omega: 2D array (norbs, npade)
+    frozen_mask = get_frozen_mask(gw)
+    nmoa, _ = gw.nmo
+    nkpts = gw.nkpts
+    mo_occ_frozen = np.zeros(shape=[2, nkpts, nmoa], dtype=np.complex128)
+    for s in range(2):
+        for k in range(nkpts):
+            mo_occ_frozen[s, k] = mo_occ[s][k][frozen_mask[s][k]]
+    return mo_occ_frozen
+
+
+def set_frozen_orbs(gw):
+    """Set .frozen attribute from frozen mask.
+
+    Parameters
+    ----------
+    gw : KUGWAC
+        unrestricted GW object
     """
-    idx = range(1,40,6)
-    sigma1 = sigma[:,idx].copy()
-    sigma2 = sigma[:,(idx[-1]+4)::4].copy()
-    sigma = np.hstack((sigma1,sigma2))
-    omega1 = omega[:,idx].copy()
-    omega2 = omega[:,(idx[-1]+4)::4].copy()
-    omega = np.hstack((omega1,omega2))
-    norbs, nw = sigma.shape
-    npade = nw // 2
-    coeff = np.zeros((npade*2,norbs),dtype=np.complex128)
-    for p in range(norbs):
-        coeff[:,p] = thiele(sigma[p,:npade*2], omega[p,:npade*2])
-
-    return coeff, omega[:,:npade*2]
-
-
-class KUGWAC(lib.StreamObject):
-
-    linearized = getattr(__config__, 'gw_gw_GW_linearized', False)
-    # Analytic continuation: pade or twopole
-    ac = getattr(__config__, 'gw_gw_GW_ac', 'pade')
-    # Whether applying finite size corrections
-    fc = getattr(__config__, 'gw_gw_GW_fc', True)
-
-    _keys = {
-        'linearized', 'ac', 'fc', 'frozen', 'mol', 'with_df',
-        'kpts', 'nkpts', 'mo_energy', 'mo_coeff', 'mo_occ', 'sigma',
-    }
-
-    def __init__(self, mf, frozen=None):
-        self.mol = mf.mol
-        self._scf = mf
-        self.verbose = self.mol.verbose
-        self.stdout = self.mol.stdout
-        self.max_memory = mf.max_memory
-
-        #TODO: implement frozen orbs
-        if frozen is not None and frozen > 0:
-            raise NotImplementedError
-        self.frozen = frozen
-
-        # DF-KGW must use GDF integrals
-        if getattr(mf, 'with_df', None):
-            self.with_df = mf.with_df
+    assert gw.nmo[0] == gw.nmo[1], "current implementation requires nmoa = nmob."
+
+    if gw.frozen is not None:
+        if gw.orbs is not None:
+            if isinstance(gw.frozen, (int, np.int64)):
+                # frozen core
+                gw.orbs_frz = [x - gw.frozen for x in gw.orbs]
+            else:
+                # frozen list
+                assert isinstance(gw.frozen[0][0], (int, np.int64))
+                assert gw.frozen[0] == gw.frozen[1]
+                gw.orbs_frz = []
+                for orbi in gw.orbs:
+                    count = len([p for p in gw.frozen[0] if p <= orbi])
+                    gw.orbs_frz.append(orbi - count)
+            if any(np.array(gw.orbs_frz) < 0):
+                raise RuntimeError('GW orbs must be larger than frozen core!')
         else:
-            raise NotImplementedError
-
-##################################################
-# don't modify the following attributes, they are not input options
-        self._nocc = None
-        self._nmo = None
-        self.kpts = mf.kpts
-        self.nkpts = len(self.kpts)
-        # self.mo_energy: GW quasiparticle energy, not scf mo_energy
-        self.mo_energy = None
-        self.mo_coeff = mf.mo_coeff
-        self.mo_occ = mf.mo_occ
-        self.sigma = None
-
-    def dump_flags(self):
+            gw.orbs_frz = range(gw.nmo[0])
+            gw.orbs = range(len(gw._scf.mo_energy[0][0]))
+            if isinstance(gw.frozen, (int, np.int64)):
+                gw.orbs = list(set(gw.orbs) - set(range(gw.frozen)))
+            else:
+                assert isinstance(gw.frozen[0][0], (int, np.int64))
+                assert gw.frozen[0] == gw.frozen[1]
+                gw.orbs = list(set(gw.orbs) - set(gw.frozen[0]))
+    else:
+        if gw.orbs is None:
+            gw.orbs = range(len(gw._scf.mo_energy[0][0]))
+        gw.orbs_frz = gw.orbs
+    return
+
+
+class KUGWAC(KRGWAC):
+    def dump_flags(self, verbose=None):
         log = logger.Logger(self.stdout, self.verbose)
         log.info('')
         log.info('******** %s ********', self.__class__)
@@ -647,134 +1019,111 @@ def dump_flags(self):
         nmoa, nmob = self.nmo
         nvira = nmoa - nocca
         nvirb = nmob - noccb
-        nkpts = self.nkpts
-        log.info('GW (nocca, noccb) = (%d, %d), (nvira, nvirb) = (%d, %d), nkpts = %d',
-                 nocca, noccb, nvira, nvirb, nkpts)
+        log.info('GW (nocca, noccb) = (%d, %d), (nvira, nvirb) = (%d, %d)', nocca, noccb, nvira, nvirb)
+        log.info('nkpt = %d', self.nkpts)
         if self.frozen is not None:
-            log.info('frozen orbitals %s', str(self.frozen))
-        logger.info(self, 'use perturbative linearized QP eqn = %s', self.linearized)
-        logger.info(self, 'analytic continuation method = %s', self.ac)
-        logger.info(self, 'GW finite size corrections = %s', self.fc)
-        return self
+            log.info('frozen orbitals = %s', str(self.frozen))
+        if self.kptlist is not None:
+            log.info('k-point list = %s', str(self.kptlist))
+        if self.orbs is not None:
+            log.info('orbital list = %s', str(self.orbs))
+        log.info('off-diagonal self-energy = %s', self.fullsigma)
+        log.info('GW density matrix = %s', self.rdm)
+        log.info('density-fitting for exchange = %s', self.vhf_df)
+        log.info('finite size corrections = %s', self.fc)
+        if self.fc_grid is not None:
+            log.info('grids for finite size corrections = %s', self.fc_grid)
+        log.info('broadening parameter = %.3e', self.eta)
+        log.info('number of grids = %d', self.nw)
+        log.info('analytic continuation method = %s', self.ac)
+        log.info('imaginary frequency cutoff = %s', str(self.ac_iw_cutoff))
+        if self.ac == 'pade':
+            log.info('Pade points = %d', self.ac_pade_npts)
+            log.info('Pade step ratio = %.3f', self.ac_pade_step_ratio)
+        log.info('use perturbative linearized QP eqn = %s', self.qpe_linearized)
+        if self.qpe_linearized is True:
+            log.info('linearized factor range = %s', self.qpe_linearized_range)
+        else:
+            log.info('QPE max iter = %d', self.qpe_max_iter)
+            log.info('QPE tolerance = %.1e', self.qpe_tol)
+        log.info('')
+        return
 
     @property
     def nocc(self):
-        return self.get_nocc()
+        frozen_mask = get_frozen_mask(self)
+        nkpts = len(self._scf.mo_energy[0])
+        neleca = 0.0
+        nelecb = 0.0
+        for k in range(nkpts):
+            neleca += np.sum(self._scf.mo_occ[0][k][frozen_mask[0][k]])
+            nelecb += np.sum(self._scf.mo_occ[1][k][frozen_mask[1][k]])
+        neleca = int(neleca / nkpts)
+        nelecb = int(nelecb / nkpts)
+        return (neleca, nelecb)
+
     @nocc.setter
     def nocc(self, n):
         self._nocc = n
 
     @property
     def nmo(self):
-        return self.get_nmo()
+        frozen_mask = get_frozen_mask(self)
+        nmoa = len(self._scf.mo_energy[0][0][frozen_mask[0][0]])
+        nmob = len(self._scf.mo_energy[1][0][frozen_mask[1][0]])
+        return (nmoa, nmob)
+
     @nmo.setter
     def nmo(self, n):
         self._nmo = n
 
-    get_nocc = get_nocc
-    get_nmo = get_nmo
-    get_frozen_mask = get_frozen_mask
+    def kernel(self, orbs=None, kptlist=None):
+        """Run a G0W0 calculation.
 
-    def kernel(self, mo_energy=None, mo_coeff=None, orbs=None, kptlist=None, nw=100):
-        """
-        Input:
-            kptlist: self-energy k-points
-            orbs: self-energy orbs
-            nw: grid number
-        Output:
-            mo_energy: GW quasiparticle energy
+        Parameters
+        ----------
+        orbs : list, optional
+            orbital list to calculate self-energy, by default None
+        kptlist : list, optional
+            k-point list to calculate self-energy, by default None
         """
-        if mo_coeff is None:
-            mo_coeff = np.array(self._scf.mo_coeff)
-        if mo_energy is None:
-            mo_energy = np.array(self._scf.mo_energy)
+        if self.mo_energy is None:
+            self.mo_energy = np.array(self._scf.mo_energy, copy=True)
+        if self.mo_coeff is None:
+            self.mo_coeff = np.array(self._scf.mo_coeff, copy=True)
+        if self.mo_occ is None:
+            self.mo_occ = np.array(self._scf.mo_occ, copy=True)
+
+        if isinstance(self.frozen, list) and (not isinstance(self.frozen[0], list)):
+            # make sure self.frozen is a list of lists if not frozen core
+            self.frozen = [self.frozen, self.frozen]
+        else:
+            assert self.frozen is None or isinstance(self.frozen, (int, np.int64))
 
-        nmoa, nmob = self.nmo
+        self.orbs = orbs
+        self.kptlist = kptlist
+
+        if hasattr(self._scf, "sigma"):
+            self.nw = max(400, self.nw)
+            self.ac_pade_npts = 18
+            self.ac_pade_step_ratio = 5.0 / 6.0
+            self.fc = False
+
+        nmoa, _ = self.nmo
         naux = self.with_df.get_naoaux()
         nkpts = self.nkpts
-        mem_incore = (3*nkpts*nmoa**2*naux) * 16/1e6
+        mem_incore = (3 * nkpts * nmoa**2 * naux) * 16 / 1e6
         mem_now = lib.current_memory()[0]
-        if (mem_incore + mem_now > 0.99*self.max_memory):
+        if mem_incore + mem_now > 0.99 * self.max_memory:
             logger.warn(self, 'Memory may not be enough!')
-            raise NotImplementedError
 
-        cput0 = (logger.process_clock(), logger.perf_counter())
+        cput0 = (time.process_time(), time.perf_counter())
         self.dump_flags()
-        self.converged, self.mo_energy, self.mo_coeff = \
-                kernel(self, mo_energy, mo_coeff, orbs=orbs,
-                       kptlist=kptlist, nw=nw, verbose=self.verbose)
-
+        kernel(self)
         logger.warn(self, 'GW QP energies may not be sorted from min to max')
         logger.timer(self, 'GW', *cput0)
-        return self.mo_energy
-
-if __name__ == '__main__':
-    from pyscf.pbc import gto
-    from pyscf.pbc.lib import chkfile
-    import os
-    cell = gto.Cell()
-    cell.build(
-        unit = 'B',
-        a = [[ 0.,          6.74027466,  6.74027466],
-             [ 6.74027466,  0.,          6.74027466],
-             [ 6.74027466,  6.74027466,  0.        ]],
-        atom = '''H 0 0 0
-                  H 1.68506866 1.68506866 1.68506866
-                  H 3.37013733 3.37013733 3.37013733''',
-        basis = 'gth-dzvp',
-        pseudo = 'gth-pade',
-        verbose = 5,
-        charge = 0,
-        spin = 1)
-
-    cell.spin = cell.spin * 3
-    kpts = cell.make_kpts([3,1,1],scaled_center=[0,0,0])
-    gdf = df.GDF(cell, kpts)
-    gdf_fname = 'h3_ints_311.h5'
-    gdf._cderi_to_save = gdf_fname
-    if not os.path.isfile(gdf_fname):
-        gdf.build()
-
-    chkfname = 'h_311.chk'
-    if os.path.isfile(chkfname):
-        kmf = scf.KUHF(cell, kpts, exxdiv=None)
-        kmf.with_df = gdf
-        kmf.with_df._cderi = gdf_fname
-        data = chkfile.load(chkfname, 'scf')
-        kmf.__dict__.update(data)
-    else:
-        kmf = scf.KUHF(cell, kpts, exxdiv=None)
-        kmf.with_df = gdf
-        kmf.with_df._cderi = gdf_fname
-        kmf.conv_tol = 1e-12
-        kmf.chkfile = chkfname
-        kmf.kernel()
-
-    gw = KUGWAC(kmf)
-    gw.linearized = False
-    gw.ac = 'pade'
-    gw.fc = False
-    nocca, noccb = gw.nocc
-    gw.kernel(kptlist=[0,1,2],orbs=range(0,nocca+3))
-    print(gw.mo_energy)
-    assert ((abs(gw.mo_energy[0][0][nocca-1]--0.28012813))<1e-5)
-    assert ((abs(gw.mo_energy[0][0][nocca]-0.13748876))<1e-5)
-    assert ((abs(gw.mo_energy[0][1][nocca-1]--0.29515851))<1e-5)
-    assert ((abs(gw.mo_energy[0][1][nocca]-0.14128011))<1e-5)
-    assert ((abs(gw.mo_energy[1][0][noccb-1]--0.33991721))<1e-5)
-    assert ((abs(gw.mo_energy[1][0][noccb]-0.10578847))<1e-5)
-    assert ((abs(gw.mo_energy[1][1][noccb-1]--0.33547973))<1e-5)
-    assert ((abs(gw.mo_energy[1][1][noccb]-0.08053408))<1e-5)
-
-    gw.fc = True
-    nocca, noccb = gw.nocc
-    gw.kernel(kptlist=[0,1,2],orbs=range(0,nocca+3))
-    print(gw.mo_energy)
-    assert ((abs(gw.mo_energy[0][0][nocca-1]--0.40244058))<1e-5)
-    assert ((abs(gw.mo_energy[0][0][nocca]-0.13618348))<1e-5)
-    assert ((abs(gw.mo_energy[0][1][nocca-1]--0.41743063))<1e-5)
-    assert ((abs(gw.mo_energy[0][1][nocca]-0.13997427))<1e-5)
-    assert ((abs(gw.mo_energy[1][0][noccb-1]--0.46133481))<1e-5)
-    assert ((abs(gw.mo_energy[1][0][noccb]-0.1044926))<1e-5)
-    assert ((abs(gw.mo_energy[1][1][noccb-1]--0.4568894))<1e-5)
-    assert ((abs(gw.mo_energy[1][1][noccb]-0.07922511))<1e-5)
+        return
+
+    set_frozen_orbs = set_frozen_orbs
+    make_rdm1 = make_rdm1_linear
+    make_gf = make_gf
diff --git a/pyscf/pbc/gw/kurpa.py b/pyscf/pbc/gw/kurpa.py
new file mode 100644
index 0000000000..01b7a437c8
--- /dev/null
+++ b/pyscf/pbc/gw/kurpa.py
@@ -0,0 +1,811 @@
+#!/usr/bin/env python
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Tianyu Zhu <zhutianyu1991@gmail.com>
+# Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
+# Author: Chaoqun Zhang <cq_zhang@outlook.com>
+# Author: Jincheng Yu <pimetamon@gmail.com>
+# Author: Jiachen Li <lijiachen.duke@gmail.com>
+#
+
+"""
+Periodic spin-unrestricted random phase approximation (direct RPA) with N^4 scaling.
+
+References:
+    T. Zhu and G.K.-L. Chan, J. Chem. Theory. Comput. 17, 727-741 (2021)
+    New J. Phys. 14, 053020 (2012)
+"""
+
+import time
+import numpy as np
+import scipy.linalg.blas as blas
+
+from pyscf import lib
+from pyscf.lib import logger, temporary_env
+from pyscf.ao2mo import _ao2mo
+from pyscf.ao2mo.incore import _conc_mos
+from pyscf.pbc import scf, tools
+from pyscf.pbc.cc.kccsd_uhf import get_nocc, get_nmo, get_frozen_mask
+
+from pyscf.gw.utils.ac_grid import _get_scaled_legendre_roots
+from pyscf.pbc.gw.kugw_ac import get_rho_response, get_rho_response_metal, get_rho_response_head, \
+    get_rho_response_wing, get_qij
+from pyscf.pbc.gw.krpa import KRPA, rho_accum_inner, rho_wing_accum_inner, get_rpa_ecorr_w, get_kconserv_ria_efficient
+
+
+def kernel(rpa, mo_energy, mo_coeff, nw=None, with_e_hf=None):
+    """RPA correlation and total energy
+
+    Parameters
+    ----------
+    rpa : KURPA
+        rpa object
+    mo_energy : double array
+        molecular orbital energies
+    mo_coeff : double ndarray
+        molecular orbital coefficients
+    nw : int, optional
+        number of frequency point on imaginary axis, by default None
+    with_e_hf : float, optional
+        extra input HF energy, by default None
+
+    Returns
+    -------
+    e_tot : float
+        RPA total energy
+    e_hf : float
+        HF energy (exact exchange for given mo_coeff)
+    e_corr : float
+        RPA correlation energy
+    """
+    assert rpa.frozen == 0 or rpa.frozen is None
+
+    # Compute HF exchange energy (EXX)
+    if with_e_hf is None:
+        uhf = scf.KUHF(rpa.mol, rpa.kpts, exxdiv=rpa._scf.exxdiv)
+        uhf.verbose = 0
+        if hasattr(rpa._scf, 'sigma'):
+            uhf = scf.addons.smearing_(uhf, sigma=rpa._scf.sigma, method=rpa._scf.smearing_method)
+        uhf.with_df = rpa._scf.with_df
+        with temporary_env(rpa.with_df, verbose=0), temporary_env(rpa.mol, verbose=0):
+            dm = rpa._scf.make_rdm1()
+            vj = uhf.get_j(uhf.cell, dm)
+            vj_tot = vj[0] + vj[1]
+            e_1e = 1.0 / len(rpa.kpts) * lib.einsum('kij,kji', dm[0] + dm[1], uhf.get_hcore()).real
+            e_j = 0.5 / len(rpa.kpts) * lib.einsum('kij,kji', dm[0] + dm[1], vj_tot).real
+            e_x = get_rpa_exx(rpa, acfd=rpa.acfd_exx, correction_only=False)
+            e_nuc = rpa._scf.energy_nuc()
+            e_hf = e_1e + e_j + e_x + e_nuc
+    else:
+        e_hf = with_e_hf
+        logger.debug(rpa, f'  Setting EXX energy explicitly to {e_hf}')
+
+    is_metal = hasattr(rpa._scf, 'sigma')
+
+    # Turn off FC for metals
+    if is_metal and rpa.fc:
+        logger.warn(rpa, 'FC not available for metals - setting rpa.fc to False')
+        rpa.fc = False
+
+    # Grids for integration on imaginary axis
+    freqs, wts = rpa.get_grids(nw=nw, mo_energy=mo_energy)
+
+    # Compute RPA correlation energy
+    if rpa.outcore:
+        if is_metal:
+            e_corr = get_rpa_ecorr_outcore_metal(rpa, freqs, wts)
+        else:
+            e_corr = get_rpa_ecorr_outcore(rpa, freqs, wts)
+    else:
+        e_corr = get_rpa_ecorr(rpa, freqs, wts)
+
+    # Compute total energy
+    e_tot = e_hf + e_corr
+
+    logger.debug(rpa, '  RPA total energy = %s', e_tot)
+    logger.debug(rpa, '  EXX energy = %s, RPA corr energy = %s', e_hf, e_corr)
+
+    return e_tot, e_hf, e_corr
+
+
+def get_idx_metal(mo_occ, threshold=1.0e-6):
+    """Get index of occupied/virtual/fractional orbitals of metals.
+
+    Parameters
+    ----------
+    mo_occ : double 1d array
+        occupation number
+    threshold : double, optional
+        threshold to determine fractionally occupied orbitals, by default 1.0e-6
+
+    Returns
+    -------
+    idx_occ : list
+        list of occupied orbital indexes
+    idx_frac : list
+        list of fractionally occupied orbital indexes
+    idx_vir : list
+        list of virtual orbital indexes
+    """
+    idx_occ = np.where(mo_occ > 1.0 - threshold)[0]
+    idx_vir = np.where(mo_occ < threshold)[0]
+    idx_frac = list(range(idx_occ[-1] + 1, idx_vir[0]))
+
+    return idx_occ, idx_frac, idx_vir
+
+
+def get_rpa_ecorr(rpa, freqs, wts):
+    """Compute RPA correlation energy.
+
+    Parameters
+    ----------
+    rpa : KURPA
+        rpa object
+    freqs : double 1d array
+        frequency grid
+    wts : double 1d array
+        weight of grids
+
+    Returns
+    -------
+    e_corr : double
+        correlation energy
+    """
+    mo_energy = np.array(rpa._scf.mo_energy)
+    mo_coeff = np.array(rpa._scf.mo_coeff)
+    nmoa, nmob = rpa.nmo
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    nw = len(freqs)
+    mydf = rpa.with_df
+    mo_occ = rpa.mo_occ
+
+    # possible kpts shift
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    is_metal = hasattr(rpa._scf, 'sigma')
+
+    if rpa.fc:
+        qij_a, qij_b, q_abs, nq_pts = rpa.get_q_mesh(mo_energy, mo_coeff)
+
+    e_corr = 0j
+
+    # Precompute k-conservation table
+    # Given k-point indices (kL, i), kconserv_table[kshift,i] contains
+    # the index j that satisfies momentum conservation,
+    # (k(i) - k(j) - k(kL)) \dot a = 2n\pi
+    # i.e.
+    # - ki + kj + kL = G
+    kconserv_table = get_kconserv_ria_efficient(rpa.mol, kpts)
+    cderiarr = mydf.cderi_array()
+
+    for kL in range(nkpts):
+        # Lij: (2, ki, L, i, j) for looping every kL
+        # Lij = np.zeros((2,nkpts,naux,nmoa,nmoa),dtype=np.complex128)
+        Lij = []
+        # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+        # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+        kidx = np.zeros((nkpts), dtype=np.int64)
+        kidx_r = np.zeros((nkpts), dtype=np.int64)
+        for i, kpti in enumerate(kpts):
+            j = kconserv_table[kL, i]
+            kptj = kpts[j]
+            # Find (ki,kj) that satisfies momentum conservation with kL
+            kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+            assert np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12  # kidx[i] = j
+            kidx[i] = j
+            kidx_r[j] = i
+            logger.debug(rpa, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s)' % (kL + 1, nkpts, i, j))
+            Lij_out_a = None
+            Lij_out_b = None
+            # Read (L|pq) and ao2mo transform to (L|ij)
+            Lpq = cderiarr.load(kpti, kptj)
+            if Lpq.shape[-1] == (nmoa * (nmoa + 1)) // 2:
+                Lpq = lib.unpack_tril(Lpq).reshape(-1, nmoa**2)
+            else:
+                Lpq = Lpq.reshape(-1, nmoa**2)
+            Lpq = Lpq.astype(np.complex128)
+            moija, ijslicea = _conc_mos(mo_coeff[0, i], mo_coeff[0, j])[2:]
+            moijb, ijsliceb = _conc_mos(mo_coeff[1, i], mo_coeff[1, j])[2:]
+            Lij_out_a = _ao2mo.r_e2(Lpq, moija, ijslicea, tao=[], ao_loc=None, out=Lij_out_a)
+            Lij_out_b = _ao2mo.r_e2(Lpq, moijb, ijsliceb, tao=[], ao_loc=None, out=Lij_out_b)
+            Lij.append(np.asarray((Lij_out_a.reshape(-1, nmoa, nmoa), Lij_out_b.reshape(-1, nmob, nmob))))
+
+        Lij = np.asarray(Lij)
+        naux = Lij.shape[2]
+        if is_metal is False:
+            Lia = [
+                np.ascontiguousarray(Lij[:, 0, :, : rpa.nocc[0], rpa.nocc[0] :]),
+                np.ascontiguousarray(Lij[:, 1, :, : rpa.nocc[1], rpa.nocc[1] :]),
+            ]
+
+        for w in range(nw):
+            # body polarizability
+            if is_metal:
+                Pi = get_rho_response_metal(freqs[w], mo_energy, mo_occ, Lij, kidx)
+            else:
+                Pi = get_rho_response(freqs[w], rpa.nocc, mo_energy, Lia, kidx)
+            if kL == 0 and rpa.fc:
+                for iq in range(nq_pts):
+                    # head Pi_00
+                    Pi_00 = get_rho_response_head(freqs[w], mo_energy, (qij_a[iq], qij_b[iq]))
+                    Pi_00 = 4.0 * np.pi / np.linalg.norm(q_abs[iq]) ** 2 * Pi_00
+                    # wings Pi_P0
+                    Pi_P0 = get_rho_response_wing(freqs[w], mo_energy, Lia, (qij_a[iq], qij_b[iq]))
+                    Pi_P0 = np.sqrt(4.0 * np.pi) / np.linalg.norm(q_abs[iq]) * Pi_P0
+
+                    # assemble Pi
+                    Pi_fc = np.zeros((naux + 1, naux + 1), dtype=Pi.dtype)
+                    Pi_fc[0, 0] = Pi_00
+                    Pi_fc[0, 1:] = Pi_P0.conj()
+                    Pi_fc[1:, 0] = Pi_P0
+                    Pi_fc[1:, 1:] = Pi
+
+                    # First, compute ec_w = Tr(Pi) + |log(det(I-Pi))|
+                    ec_w = np.trace(Pi_fc)
+                    # The following two lines are equivalent to
+                    # Pi = np.eye(naux) - Pi
+                    blas.zdscal(-1.0, Pi_fc.ravel(), overwrite_x=1)
+                    np.fill_diagonal(Pi_fc, np.diagonal(Pi_fc) + 1.0)
+                    ec_w += np.linalg.slogdet((Pi_fc))[1]
+                    e_corr += 1.0 / (2.0 * np.pi) * 1.0 / nkpts * 1.0 / nq_pts * ec_w * wts[w]
+            else:
+                # First, compute ec_w = Tr(Pi) + |log(det(I-Pi))|
+                ec_w = np.trace(Pi)
+                # The following two lines are equivalent to
+                # Pi = np.eye(naux) - Pi
+                blas.zdscal(-1.0, Pi.ravel(), overwrite_x=1)
+                np.fill_diagonal(Pi, np.diagonal(Pi) + 1.0)
+                ec_w += np.linalg.slogdet((Pi))[1]
+                e_corr += 1.0 / (2.0 * np.pi) * 1.0 / nkpts * ec_w * wts[w]
+
+    return e_corr.real
+
+
+def get_rpa_ecorr_outcore(rpa, freqs, wts):
+    """Low-memory routine to compute RPA correlation energy.
+
+    Parameters
+    ----------
+    rpa : KURPA
+        rpa object
+    freqs : double 1d array
+        frequency grid
+    wts : double 1d array
+        weight of grids
+
+    Returns
+    -------
+    e_corr : double
+        correlation energy
+    """
+    mo_energy = np.array(rpa._scf.mo_energy)
+    mo_coeff = np.array(rpa._scf.mo_coeff)
+    nmoa = rpa.nmo[0]
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    nw = len(freqs)
+    mydf = rpa.with_df
+
+    # possible kpts shift
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    if rpa.fc:
+        qij_a, qij_b, q_abs, nq_pts = rpa.get_q_mesh(mo_energy, mo_coeff)
+
+    e_corr = 0j
+
+    # Precompute k-conservation table
+    # Given k-point indices (kL, i), kconserv_table[kshift,i] contains
+    # the index j that satisfies momentum conservation,
+    # (k(i) - k(j) - k(kL)) \dot a = 2n\pi
+    # i.e.
+    # - ki + kj + kL = G
+    kconserv_table = get_kconserv_ria_efficient(rpa.mol, kpts)
+    cderiarr = mydf.cderi_array()
+
+    for kL in range(nkpts):
+        Pi = None
+        Pi_P0 = None
+        kidx = np.zeros((nkpts), dtype=np.int64)
+        kidx_r = np.zeros((nkpts), dtype=np.int64)
+        for s in range(2):
+            nseg = rpa.nocc[s] // rpa.segsize + 1
+            for iseg in range(nseg):
+                orb_start = iseg * rpa.segsize
+                orb_end = min((iseg + 1) * rpa.segsize, rpa.nocc[s])
+                if orb_end == orb_start:
+                    continue
+                norb_this_iter = orb_end - orb_start
+
+                for i, kpti in enumerate(kpts):
+                    j = kconserv_table[kL, i]
+                    kptj = kpts[j]
+                    # Find (ki,kj) that satisfies momentum conservation with kL
+                    kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+                    assert np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12  # kidx[i] = j
+                    kidx[i] = j
+                    kidx_r[j] = i
+                    logger.debug(rpa, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s)' % (kL + 1, nkpts, i, j))
+                    # Read (L|pq) and ao2mo transform to (L|ij)
+                    Lpq = cderiarr.load(kpti, kptj)
+                    if Lpq.shape[-1] == (nmoa * (nmoa + 1)) // 2:
+                        Lpq = lib.unpack_tril(Lpq).reshape(-1, nmoa**2)
+                    else:
+                        Lpq = Lpq.reshape(-1, nmoa**2)
+                    Lpq = Lpq.astype(np.complex128)
+                    naux = Lpq.shape[0]
+                    moij, ijslice = _conc_mos(mo_coeff[s, i], mo_coeff[s, j])[2:]
+                    ijslice = (orb_start, orb_end, rpa.nmo[s] + rpa.nocc[s], 2 * rpa.nmo[s])
+                    Lij_slice = _ao2mo.r_e2(Lpq, moij, ijslice, tao=[], ao_loc=None)
+                    Lij_slice = Lij_slice.reshape(naux, norb_this_iter, rpa.nmo[s] - rpa.nocc[s])
+                    if Pi is None:
+                        Pi = np.zeros((nw, naux, naux), dtype=np.complex128)
+                        if kL == 0 and rpa.fc:
+                            Pi_P0 = np.zeros((nq_pts, nw, naux), dtype=np.complex128)
+                    eia = mo_energy[s, i][orb_start:orb_end, None] - mo_energy[s, j][None, rpa.nocc[s] :]
+                    for w in range(nw):
+                        rho_accum_inner(Pi[w], eia, freqs[w], Lij_slice, alpha=2.0 / nkpts)
+                        if kL == 0 and rpa.fc:
+                            for iq in range(nq_pts):
+                                rho_wing_accum_inner(
+                                    Pi_P0[iq, w],
+                                    eia,
+                                    freqs[w],
+                                    Lij_slice,
+                                    (qij_a if s == 0 else qij_b)[iq, i, orb_start:orb_end],
+                                    alpha=2.0 / nkpts,
+                                )
+
+        for w in range(nw):
+            if kL == 0 and rpa.fc:
+                for iq in range(nq_pts):
+                    Pi_00 = get_rho_response_head(freqs[w], mo_energy, (qij_a[iq], qij_b[iq]))
+                    Pi_00 = 4.0 * np.pi / np.linalg.norm(q_abs[iq]) ** 2 * Pi_00
+                    Pi_P0_iq = np.sqrt(4.0 * np.pi) / np.linalg.norm(q_abs[iq]) * Pi_P0[iq, w]
+
+                    Pi_fc = np.zeros((naux + 1, naux + 1), dtype=Pi.dtype)
+                    Pi_fc[0, 0] = Pi_00
+                    Pi_fc[0, 1:] = Pi_P0_iq.conj()
+                    Pi_fc[1:, 0] = Pi_P0_iq
+                    Pi_fc[1:, 1:] = Pi[w]
+
+                    e_corr += get_rpa_ecorr_w(Pi_fc, wts[w] / nq_pts)
+            else:
+                e_corr += get_rpa_ecorr_w(Pi[w], wts[w])
+
+    e_corr = e_corr.real
+    e_corr *= 1.0 / (2.0 * np.pi) / nkpts
+    return e_corr
+
+
+def get_rpa_ecorr_outcore_metal(rpa, freqs, wts):
+    """Low-memory routine to compute RPA correlation energy for metals.
+
+    Parameters
+    ----------
+    rpa : KURPA
+        rpa object
+    freqs : double 1d array
+        frequency grid
+    wts : double 1d array
+        weight of grids
+
+    Returns
+    -------
+    e_corr : double
+        correlation energy
+    """
+    mo_energy = np.array(rpa._scf.mo_energy)
+    mo_coeff = np.array(rpa._scf.mo_coeff)
+    nmoa = rpa.nmo[0]
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    nw = len(freqs)
+    mydf = rpa.with_df
+    mo_occ = np.array(rpa.mo_occ)
+
+    # possible kpts shift
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    if rpa.fc:
+        qij_a, qij_b, q_abs, nq_pts = rpa.get_q_mesh(mo_energy, mo_coeff)
+
+    e_corr = 0j
+
+    # Precompute k-conservation table
+    # Given k-point indices (kL, i), kconserv_table[kshift,i] contains
+    # the index j that satisfies momentum conservation,
+    # (k(i) - k(j) - k(kL)) \dot a = 2n\pi
+    # i.e.
+    # - ki + kj + kL = G
+    kconserv_table = get_kconserv_ria_efficient(rpa.mol, kpts)
+    cderiarr = mydf.cderi_array()
+
+    for kL in range(nkpts):
+        Pi = None
+        kidx = np.zeros((nkpts), dtype=np.int64)
+        kidx_r = np.zeros((nkpts), dtype=np.int64)
+        for s in range(2):
+            for i, kpti in enumerate(kpts):
+                j = kconserv_table[kL, i]
+                kptj = kpts[j]
+                # Find (ki,kj) that satisfies momentum conservation with kL
+                kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+                assert np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12  # kidx[i] = j
+                kidx[i] = j
+                kidx_r[j] = i
+                logger.debug(rpa, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s)' % (kL + 1, nkpts, i, j))
+                # Read (L|pq) and ao2mo transform to (L|ij)
+                Lpq = cderiarr.load(kpti, kptj)
+                if Lpq.shape[-1] == (nmoa * (nmoa + 1)) // 2:
+                    Lpq = lib.unpack_tril(Lpq).reshape(-1, nmoa**2)
+                else:
+                    Lpq = Lpq.reshape(-1, nmoa**2)
+                Lpq = Lpq.astype(np.complex128)
+                naux = Lpq.shape[0]
+
+                idx_occ_i, idx_frac_i, idx_vir_i = get_idx_metal(mo_occ[s, i])
+                idx_occ_j, idx_frac_j, idx_vir_j = get_idx_metal(mo_occ[s, j])
+
+                nocc_i = len(idx_occ_i)
+                nfrac_i = len(idx_frac_i)
+                nocc_j = len(idx_occ_j)
+                nfrac_j = len(idx_frac_j)
+                nseg = (nocc_i + nfrac_i) // rpa.segsize + 1
+                for iseg in range(nseg):
+                    orb_start = iseg * rpa.segsize
+                    orb_end = min((iseg + 1) * rpa.segsize, nocc_i + nfrac_i)
+                    if orb_end == orb_start:
+                        break
+                    norb_this_iter = orb_end - orb_start
+
+                    moij, ijslice = _conc_mos(mo_coeff[s, i], mo_coeff[s, j])[2:]
+
+                    ijslice = (orb_start, orb_end, rpa.nmo[s] + nocc_j, 2 * rpa.nmo[s])
+                    Lij_slice = _ao2mo.r_e2(Lpq, moij, ijslice, tao=[], ao_loc=None)
+                    Lij_slice = Lij_slice.reshape(naux, norb_this_iter, rpa.nmo[s] - nocc_j)
+                    if Pi is None:
+                        Pi = np.zeros((nw, naux, naux), dtype=np.complex128)
+
+                    # Find ka that conserves with ki and kL (-ki+ka+kL=G)
+                    eia = mo_energy[s, i][orb_start:orb_end, None] - mo_energy[s, j][None, nocc_j:]
+                    fia = mo_occ[s, i][orb_start:orb_end, None] - mo_occ[s, j][None, nocc_j:]
+                    # The overall fia[nocc_i:, :nfrac_j] *= 0.5 for double counting
+                    if orb_start >= nocc_i:
+                        fia[:, :nfrac_j] *= 0.5
+                    elif orb_end > nocc_i:
+                        offset = nocc_i - orb_start
+                        fia[offset:, :nfrac_j] *= 0.5
+                    for w in range(nw):
+                        rho_accum_inner(Pi[w], eia, freqs[w], Lij_slice, alpha=2.0 / nkpts, fia=fia)
+
+        for w in range(nw):
+            e_corr += get_rpa_ecorr_w(Pi[w], wts[w])
+
+    e_corr = e_corr.real
+    e_corr *= 1.0 / (2.0 * np.pi) / nkpts
+
+    return e_corr
+
+
+def get_rpa_exx(rpa, acfd=False, correction_only=False):
+    """Calculate RPA exchange energy.
+    For gapped systems, Hartree-Fock and adiabatic connection fluctuation dissipation exchange energies are the same.
+    For metallic systems, they are different.
+    The ACFD exchange energy is given by equation 12 in doi.org/10.1103/PhysRevB.81.115126
+
+    Parameters
+    ----------
+    rpa : KURPA
+        rpa object
+    acfd : bool, optional
+        calculate ACFD exchange energy, by default False
+    correction_only : bool, optional
+        only calculate the correction term, by default False
+
+    Returns
+    -------
+    ex : double
+        exchange energy
+    """
+    mo_energy = np.asarray(rpa._scf.mo_energy)
+    mo_coeff = np.asarray(rpa._scf.mo_coeff)
+    mo_occ = np.asarray(rpa._scf.mo_occ)
+
+    nocc = rpa.nocc
+    nspin, _, nao, _ = mo_coeff.shape
+    nkpts = rpa.nkpts
+    kpts = rpa.kpts
+    mydf = rpa.with_df
+
+    # possible kpts shift center
+    kscaled = rpa.mol.get_scaled_kpts(kpts)
+    kscaled -= kscaled[0]
+
+    ex = 0j
+    cderiarr = mydf.cderi_array()
+    for kL in range(nkpts):
+        # kidx: save kj that conserves with kL and ki (-ki+kj+kL=G)
+        # kidx_r: save ki that conserves with kL and kj (-ki+kj+kL=G)
+        kidx = np.zeros(shape=[nkpts], dtype=np.int64)
+        kidx_r = np.zeros(shape=[nkpts], dtype=np.int64)
+        for i in range(nkpts):
+            for j in range(nkpts):
+                # Find (ki,kj) that satisfies momentum conservation with kL
+                kconserv = -kscaled[i] + kscaled[j] + kscaled[kL]
+                is_kconserv = np.linalg.norm(np.round(kconserv) - kconserv) < 1e-12
+                if is_kconserv:
+                    kidx[i] = j
+                    kidx_r[j] = i
+
+        for kn in range(nkpts):
+            # Find km that conserves with kn and kL (-km+kn+kL=G)
+            km = kidx_r[kn]
+
+            # logger.debug(gw, 'Read Lpq (kL: %s / %s, ki: %s, kj: %s @ Rank %d)' % (kL + 1, nkpts, i, j, rank))
+            # Read (L|pq) and ao2mo transform to (L|ij)
+            # support unequal naux on different k points
+            Lpq_ao = cderiarr.load(kpts[km], kpts[kn])
+            if Lpq_ao.shape[-1] == (nao * (nao + 1)) // 2:
+                Lpq_ao = lib.unpack_tril(Lpq_ao).reshape(-1, nao**2)
+            else:
+                Lpq_ao = Lpq_ao.reshape(-1, nao**2)
+            Lpq_ao = Lpq_ao.astype(np.complex128)
+
+            for s in range(nspin):
+                Lij = None
+                if hasattr(rpa._scf, 'sigma'):
+                    idx_occ_i, idx_frac_i, _ = get_idx_metal(mo_occ[s][km])
+                    idx_occ_j, idx_frac_j, _ = get_idx_metal(mo_occ[s][kn])
+                    nocc_i = len(idx_occ_i) + len(idx_frac_i)
+                    nocc_j = len(idx_occ_j) + len(idx_frac_j)
+                    moij, ijslice = _conc_mos(mo_coeff[s][km][:, :nocc_i], mo_coeff[s][kn][:, :nocc_j])[2:]
+                    Lij = _ao2mo.r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lij)
+                    Lij = Lij.reshape(-1, nocc_i, nocc_j)
+
+                    if acfd is True:
+                        if correction_only is True:
+                            mo_occ_ij = np.minimum(mo_occ[s][km][:nocc_i, None], mo_occ[s][kn][None, :nocc_j])
+                            mo_occ_ij -= mo_occ[s][km][:nocc_i, None] * mo_occ[s][kn][None, :nocc_j]
+                        else:
+                            # The following line is equivalent to the frequency integration in equation 12 in
+                            # doi.org/10.1103/PhysRevB.81.115126
+                            # TODO: add a detailed note
+                            eij = mo_energy[s][km][:nocc_i, None] - mo_energy[s][kn][None, :nocc_j]
+                            integrand = np.zeros((nocc_i, nocc_j), dtype=np.complex128)
+                            integrand[eij > 1e-6] = 1
+                            integrand[eij < -1e-6] = -1
+                            mo_occ_ij = 1.0 - integrand
+                            mo_occ_ij = mo_occ_ij * mo_occ[s][km][:nocc_i, None]
+                    else:
+                        mo_occ_ij = mo_occ[s][km][:nocc_i, None] * mo_occ[s][kn][None, :nocc_j]
+                    Lij_occ = Lij * mo_occ_ij[None]
+                    # ex -= np.einsum('Lij,Lij->', Lij_occ.reshape(-1, nocc, nocc), Lij.reshape(-1, nocc, nocc).conj())
+                    ex -= blas.zdotc(Lij_occ.ravel(), Lij.ravel())
+                else:
+                    moij, ijslice = _conc_mos(mo_coeff[s][km][:, :nocc[s]], mo_coeff[s][kn][:, :nocc[s]])[2:]
+                    Lij = _ao2mo.r_e2(Lpq_ao, moij, ijslice, tao=[], ao_loc=None, out=Lij)
+                    # ex -= np.einsum('Lij,Lij->', Lij.reshape(-1, nocc, nocc), Lij.reshape(-1, nocc, nocc).conj())
+                    ex -= blas.zdotc(Lij.ravel(), Lij.ravel())
+
+    ex = ex.real
+    ex *= 0.5 / nkpts**2
+
+    if rpa._scf.exxdiv == 'ewald' and rpa._scf.cell.dimension != 0:
+        madelung = tools.pbc.madelung(rpa._scf.cell, kpts)
+        for s in range(nspin):
+            exxdiv_shift = 0.5 * madelung * np.sum(mo_occ[s]**2) / (nkpts)
+            ex -= exxdiv_shift
+            if acfd is True:
+                for k in range(nkpts):
+                    idx_occ, idx_frac, _ = get_idx_metal(mo_occ[s][k])
+                    f_i = mo_occ[s][k][:(len(idx_occ) + len(idx_frac))]
+                    ex -= 0.5 * madelung * np.sum(f_i - f_i * f_i) / nkpts
+
+    return ex
+
+
+class KURPA(KRPA):
+    def dump_flags(self, verbose=None):
+        log = logger.Logger(self.stdout, self.verbose)
+        log.info('')
+        log.info('******** %s ********', self.__class__)
+        log.info('method = %s', self.__class__.__name__)
+        nocca, noccb = self.nocc
+        nmoa, nmob = self.nmo
+        nvira = nmoa - nocca
+        nvirb = nmob - noccb
+        nkpts = self.nkpts
+        log.info(
+            'RPA (nocca, noccb) = (%d, %d), (nvira, nvirb) = (%d, %d), nkpts = %d', nocca, noccb, nvira, nvirb, nkpts
+        )
+        if self.frozen is not None and self.frozen > 0:
+            log.info('frozen orbitals %s', str(self.frozen))
+        log.info('grid type = %s', self.grids_alg)
+        log.info('outcore mode = %s', self.outcore)
+        if self.outcore is True:
+            log.info('outcore segment size = %d', self.segsize)
+        log.info('RPA finite size corrections = %s', self.fc)
+        log.info('ACFD exchange energy = %s', self.acfd_exx)
+        log.info('')
+        return
+
+    @property
+    def nocc(self):
+        mo_occ = self._scf.mo_occ
+        return (int(np.sum(mo_occ[0][0])), int(np.sum(mo_occ[1][0])))
+
+    @nocc.setter
+    def nocc(self, n):
+        self._nocc = n
+
+    @property
+    def nmo(self):
+        return (len(self._scf.mo_energy[0][0]), len(self._scf.mo_energy[1][0]))
+
+    @nmo.setter
+    def nmo(self, n):
+        self._nmo = n
+
+    get_nocc = get_nocc
+    get_nmo = get_nmo
+    get_frozen_mask = get_frozen_mask
+
+    def kernel(self, mo_energy=None, mo_coeff=None, nw=None):
+        """RPA correlation and total energy
+
+        Calculated total energy, HF energy and RPA correlation energy
+        are stored in self.e_tot, self.e_hf, self.e_corr
+
+        Parameters
+        ----------
+        mo_energy : double array
+            molecular orbital energies
+        mo_coeff : double ndarray
+            molecular orbital coefficients
+        nw : int, optional
+            number of frequency point on imaginary axis, by default None
+
+        Returns
+        -------
+        e_tot : float
+            RPA total energy
+        e_hf : float
+            HF energy (exact exchange for given mo_coeff)
+        e_corr : float
+            RPA correlation energy
+        """
+        if mo_coeff is None:
+            mo_coeff = np.array(self._scf.mo_coeff)
+        if mo_energy is None:
+            mo_energy = np.array(self._scf.mo_energy)
+
+        nmoa = self.nmo[0]
+        naux = self.with_df.get_naoaux()
+        nkpts = self.nkpts
+        mem_incore = (3 * nkpts * nmoa**2 * naux) * 16 / 1e6
+        mem_now = lib.current_memory()[0]
+        if mem_incore + mem_now > 0.99 * self.max_memory:
+            logger.warn(self, 'Memory may not be enough!')
+            raise NotImplementedError
+
+        cput0 = (time.process_time(), time.perf_counter())
+        self.dump_flags()
+        self.e_tot, self.e_hf, self.e_corr = kernel(self, mo_energy, mo_coeff, nw=nw)
+        logger.timer(self, 'RPA', *cput0)
+        return self.e_tot, self.e_hf, self.e_corr
+
+    def get_grids(self, alg=None, nw=None, mo_energy=None):
+        """Generate grids for integration.
+
+        Parameters
+        ----------
+        alg : str, optional
+            algorithm for generating grids, by default None
+        nw : int, optional
+            number of grids, by default None
+        mo_energy : double 3d array, optional
+            orbital energy, used for minimax grids, by default None
+
+        Returns
+        -------
+        freqs : double 1d array
+            frequency grid
+        wts : double 1d array
+            weight of grids
+        """
+        if alg is None:
+            alg = self.grids_alg
+        if mo_energy is None:
+            mo_energy = np.array(self._scf.mo_energy)
+
+        if alg == 'legendre':
+            nw = 40 if nw is None else nw
+            freqs, wts = _get_scaled_legendre_roots(nw)
+        else:
+            raise NotImplementedError('Grids algorithm not implemented!')
+
+        return freqs, wts
+
+    def get_q_mesh(self, mo_energy, mo_coeff):
+        """Get q-mesh for finite size correction.
+        Equation 39-42 in doi.org/10.1021/acs.jctc.0c00704
+
+        Parameters
+        ----------
+        mo_energy : double 3d array
+            orbital energy
+        mo_coeff : double 4d array
+            coefficient from AO to MO
+
+        Returns
+        -------
+        qij : double 1d array
+            q-mesh grids
+        q_abs : double 1d array
+            absolute positions of q-mesh grids
+        nq_pts : init
+            number of q-mesh grids
+        """
+        # Set up q mesh for q->0 finite size correction
+        nmoa, nmob = self.nmo
+        nocca, noccb = self.nocc
+        nkpts = self.nkpts
+        if not self.fc_grid:
+            q_pts = np.array([1e-3, 0, 0]).reshape(1, 3)
+        else:
+            Nq = 4
+            q_pts = np.zeros((Nq**3 - 1, 3))
+            for i in range(Nq):
+                for j in range(Nq):
+                    for k in range(Nq):
+                        if i == 0 and j == 0 and k == 0:
+                            continue
+                        else:
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 0] = k * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 1] = j * 5e-4
+                            q_pts[i * Nq**2 + j * Nq + k - 1, 2] = i * 5e-4
+        nq_pts = len(q_pts)
+        q_abs = self.mol.get_abs_kpts(q_pts)
+
+        # Get qij = 1/sqrt(Omega) * < psi_{ik} | e^{iqr} | psi_{ak-q} > at q: (nkpts, nocc, nvir)
+        qij_a = np.zeros((nq_pts, nkpts, nocca, nmoa - nocca), dtype=np.complex128)
+        qij_b = np.zeros((nq_pts, nkpts, noccb, nmob - noccb), dtype=np.complex128)
+        for k in range(nq_pts):
+            qij_tmp = get_qij(self, q_abs[k], mo_energy, mo_coeff)
+            qij_a[k] = qij_tmp[0]
+            qij_b[k] = qij_tmp[1]
+
+        return qij_a, qij_b, q_abs, nq_pts
+
+    def get_acfd_exx(self, correction_only=False):
+        """Calculate ACFD exchange energy.
+
+        Parameters
+        ----------
+        correction_only : bool
+            only return the correction term
+
+        Returns
+        -------
+        ex_acfd : double
+            ACFD exchange energy
+        """
+        ex_acfd = get_rpa_exx(self, acfd=True, correction_only=correction_only)
+        return ex_acfd
diff --git a/pyscf/pbc/gw/test/test_gw_ac.py b/pyscf/pbc/gw/test/test_gw_ac.py
new file mode 100644
index 0000000000..4d36cc1781
--- /dev/null
+++ b/pyscf/pbc/gw/test/test_gw_ac.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import pytest
+
+from pyscf.pbc import df, dft, gto, tools
+from pyscf.pbc.gw.gw_ac import GWAC
+
+
+@pytest.fixture(scope="module")
+def diamond_supercell_pbe():
+    ucell = gto.Cell()
+    ucell.build(
+        unit="angstrom",
+        a="""
+            0.000000     1.783500     1.783500
+            1.783500     0.000000     1.783500
+            1.783500     1.783500     0.000000
+        """,
+        atom="C 1.337625 1.337625 1.337625; C 2.229375 2.229375 2.229375",
+        dimension=3,
+        verbose=0,
+        output="/dev/null",
+        pseudo="gth-pade",
+        basis="gth-szv",
+        precision=1e-12,
+    )
+
+    cell = tools.super_cell(ucell, [3, 1, 1])
+    cell.verbose = 0
+    cell.output = "/dev/null"
+
+    gdf = df.RSDF(cell)
+    gdf.build()
+
+    mf = dft.RKS(cell).rs_density_fit()
+    mf.xc = "pbe"
+    mf.exxdiv = None
+    mf.with_df = gdf
+    mf.conv_tol = 1e-12
+    mf.kernel()
+
+    yield mf
+
+
+def test_gwac_pade_diamond_supercell_high_cost(diamond_supercell_pbe):
+    gw = GWAC(diamond_supercell_pbe)
+    gw.kernel()
+
+    assert gw.mo_energy[5] == pytest.approx(0.52637379, abs=1e-4)
+    assert gw.mo_energy[10] == pytest.approx(0.62044176, abs=1e-4)
+    assert gw.mo_energy[12] == pytest.approx(0.96572544, abs=1e-4)
+    assert gw.mo_energy[15] == pytest.approx(1.0751724, abs=1e-4)
diff --git a/pyscf/pbc/gw/test/test_krgw.py b/pyscf/pbc/gw/test/test_krgw.py
index 45bbe4553e..15c88715bd 100644
--- a/pyscf/pbc/gw/test/test_krgw.py
+++ b/pyscf/pbc/gw/test/test_krgw.py
@@ -1,11 +1,7 @@
 #!/usr/bin/env python
 
 import unittest
-import numpy
-import os
-from pyscf import lib
-from pyscf.pbc import gto, dft, scf, df
-from pyscf.pbc.gw import krgw_ac
+from pyscf.pbc import gto, dft, df
 from pyscf.pbc.gw import krgw_cd
 
 def setUpModule():
@@ -33,32 +29,6 @@ def tearDownModule():
     del cell, kpts, gdf
 
 class KnownValues(unittest.TestCase):
-    def test_gwac_pade_high_cost(self):
-        kmf = dft.KRKS(cell, kpts).density_fit(with_df=gdf)
-        kmf.xc = 'pbe'
-        kmf.kernel()
-
-        gw = krgw_ac.KRGWAC(kmf)
-        gw.linearized = False
-        gw.ac = 'pade'
-
-        # without finite size corrections
-        gw.fc = False
-        nocc = gw.nocc
-        gw.kernel(kptlist=[0,1,2],orbs=range(0, nocc+3))
-        self.assertAlmostEqual(gw.mo_energy[0][nocc-1], 0.62045797, 4)
-        self.assertAlmostEqual(gw.mo_energy[0][nocc]  , 0.96574324, 4)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc-1], 0.52639137, 4)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc]  , 1.07513258, 4)
-
-        # with finite size corrections
-        gw.fc = True
-        gw.kernel(kptlist=[0,1,2], orbs=range(0, nocc+3))
-        self.assertAlmostEqual(gw.mo_energy[0][nocc-1], 0.54277092, 4)
-        self.assertAlmostEqual(gw.mo_energy[0][nocc]  , 0.80148537, 4)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc-1], 0.45073793, 4)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc]  , 0.92910108, 4)
-
     def test_gwcd_high_cost(self):
         kmf = dft.KRKS(cell, kpts).density_fit(with_df=gdf)
         kmf.xc = 'pbe'
@@ -84,42 +54,6 @@ def test_gwcd_high_cost(self):
         self.assertAlmostEqual(gw.mo_energy[1][nocc-1], 0.45073751, 4)
         self.assertAlmostEqual(gw.mo_energy[1][nocc],   0.92910117, 4)
 
-    def test_gw(self):
-        cell = gto.Cell()
-        cell.build(a = '''
-                   0.000000     1.783500     1.783500
-                   1.783500     0.000000     1.783500
-                   1.783500     1.783500     0.000000
-                   ''',
-                   atom = 'H 1.337625 1.337625 1.337625; H 2.229375 2.229375 2.229375',
-                   verbose = 4,
-                   output = '/dev/null',
-                   basis=[[0, [2., 1.]], [0, [.5, 1.]]])
-
-        kpts = cell.make_kpts([3,1,1],scaled_center=[0,0,0])
-        kmf = dft.KRKS(cell, kpts).density_fit().run()
-
-        gw = krgw_ac.KRGWAC(kmf)
-        gw.linearized = True
-        gw.ac = 'pade'
-        # without finite size corrections
-        gw.fc = False
-        nocc = gw.nocc
-        gw.kernel(kptlist=[0,1,2],orbs=range(0,nocc+3))
-        self.assertAlmostEqual(gw.mo_energy[0][nocc-1], -0.257088388010083, 6)
-        self.assertAlmostEqual(gw.mo_energy[0][nocc]  , 0.7377021147675703, 6)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc-1], -0.121872186953884, 6)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc]  , 0.570710170186033 , 6)
-
-        # with finite size corrections
-        gw.linearized = False
-        gw.fc = True
-        gw.kernel(kptlist=[0,1,2],orbs=range(0,nocc+3))
-        self.assertAlmostEqual(gw.mo_energy[0][nocc-1], -0.464099926108335, 6)
-        self.assertAlmostEqual(gw.mo_energy[0][nocc]  , 0.7105306664244474, 6)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc-1], -0.347704595829313, 6)
-        self.assertAlmostEqual(gw.mo_energy[1][nocc]  , 0.552136080110482 , 6)
-
 if __name__ == '__main__':
     print('Full Tests for KRGW')
     unittest.main()
diff --git a/pyscf/pbc/gw/test/test_krgw_ac.py b/pyscf/pbc/gw/test/test_krgw_ac.py
new file mode 100644
index 0000000000..617e7a2441
--- /dev/null
+++ b/pyscf/pbc/gw/test/test_krgw_ac.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+import pytest
+
+from pyscf.pbc import df, dft, gto
+from pyscf.pbc.gw.krgw_ac import KRGWAC
+
+
+@pytest.fixture(scope="module")
+def diamond_pbe():
+    cell = gto.Cell()
+    cell.build(
+        unit="angstrom",
+        a="""
+            0.000000     1.783500     1.783500
+            1.783500     0.000000     1.783500
+            1.783500     1.783500     0.000000
+        """,
+        atom="C 1.337625 1.337625 1.337625; C 2.229375 2.229375 2.229375",
+        dimension=3,
+        verbose=0,
+        output="/dev/null",
+        pseudo="gth-pade",
+        basis="gth-szv",
+        precision=1e-10,
+    )
+
+    kpts = cell.make_kpts([3, 1, 1], scaled_center=[0, 0, 0])
+    gdf = df.RSDF(cell, kpts)
+    gdf.build()
+
+    kmf = dft.KRKS(cell, kpts).rs_density_fit()
+    kmf.xc = "pbe"
+    kmf.with_df = gdf
+    kmf.conv_tol = 1e-12
+    kmf.kernel()
+
+    yield kmf
+
+    cell.stdout.close()
+
+
+def test_krgwac_pade_no_fc(diamond_pbe):
+    gw = KRGWAC(diamond_pbe)
+    gw.ac = "pade"
+    gw.qpe_linearized = False
+    gw.fc = False
+    gw.kernel(kptlist=[0, 1, 2], orbs=range(0, 7))
+
+    assert gw.mo_energy[0][3] == pytest.approx(0.62044205, abs=1e-4)
+    assert gw.mo_energy[0][4] == pytest.approx(0.96572609, abs=1e-4)
+    assert gw.mo_energy[1][3] == pytest.approx(0.52637438, abs=1e-4)
+    assert gw.mo_energy[1][4] == pytest.approx(1.07517363, abs=1e-4)
+
+
+def test_krgwac_pade_no_fc_outcore(diamond_pbe):
+    gw = KRGWAC(diamond_pbe)
+    gw.ac = "pade"
+    gw.qpe_linearized = False
+    gw.fc = False
+    gw.outcore = True
+    gw.kernel(kptlist=[0, 1, 2], orbs=range(0, 7))
+
+    assert gw.mo_energy[0][3] == pytest.approx(0.62044205, abs=1e-4)
+    assert gw.mo_energy[0][4] == pytest.approx(0.96572609, abs=1e-4)
+    assert gw.mo_energy[1][3] == pytest.approx(0.52637438, abs=1e-4)
+    assert gw.mo_energy[1][4] == pytest.approx(1.07517363, abs=1e-4)
+
+
+def test_krgwac_pade_with_fc(diamond_pbe):
+    gw = KRGWAC(diamond_pbe)
+    gw.ac = "pade"
+    gw.qpe_linearized = False
+    gw.fc = True
+    gw.kernel(kptlist=[0, 1, 2], orbs=range(0, 7))
+
+    assert gw.mo_energy[0][3] == pytest.approx(0.44025061, abs=1e-4)
+    assert gw.mo_energy[0][4] == pytest.approx(0.80148565, abs=1e-4)
+    assert gw.mo_energy[1][3] == pytest.approx(0.35193483, abs=1e-4)
+    assert gw.mo_energy[1][4] == pytest.approx(0.92909525, abs=1e-4)
+
+
+def test_krgwac_pade_with_fc_frozen_core(diamond_pbe):
+    gw = KRGWAC(diamond_pbe)
+    gw.ac = "pade"
+    gw.qpe_linearized = False
+    gw.fc = True
+    gw.frozen = 1
+    gw.kernel()
+
+    assert gw.mo_energy[0][3] == pytest.approx(0.44092615, abs=1e-4)
+    assert gw.mo_energy[0][4] == pytest.approx(0.79820946, abs=1e-4)
diff --git a/pyscf/pbc/gw/test/test_krpa.py b/pyscf/pbc/gw/test/test_krpa.py
new file mode 100644
index 0000000000..b71ef7bc2d
--- /dev/null
+++ b/pyscf/pbc/gw/test/test_krpa.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+import pytest
+
+from pyscf.pbc import df, gto, scf
+from pyscf.pbc.gw.krpa import KRPA
+
+
+@pytest.fixture(scope="module")
+def diamond_krhf():
+    cell = gto.Cell()
+    cell.build(
+        unit="angstrom",
+        a="""
+            0.000000     1.783500     1.783500
+            1.783500     0.000000     1.783500
+            1.783500     1.783500     0.000000
+        """,
+        atom="C 1.337625 1.337625 1.337625; C 2.229375 2.229375 2.229375",
+        dimension=3,
+        verbose=0,
+        output="/dev/null",
+        pseudo="gth-pbe",
+        basis="gth-dzv",
+        precision=1e-12,
+    )
+
+    kpts = cell.make_kpts([3, 1, 1], scaled_center=[0, 0, 0])
+    gdf = df.RSGDF(cell, kpts)
+    gdf.build()
+
+    kmf = scf.KRHF(cell, kpts).rs_density_fit()
+    kmf.with_df = gdf
+    kmf.conv_tol = 1e-12
+    kmf.kernel()
+
+    yield kmf
+
+    cell.stdout.close()
+
+
+def test_krpa_no_fc(diamond_krhf):
+    rpa = KRPA(diamond_krhf)
+    rpa.fc = False
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.1852772037535004, abs=1e-6)
+    assert rpa.e_tot == pytest.approx(-10.694392044197565, abs=1e-6)
+
+
+def test_krpa_no_fc_outcore(diamond_krhf):
+    rpa = KRPA(diamond_krhf)
+    rpa.outcore = True
+    rpa.segsize = 2
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.1852772037535004, abs=1e-6)
+    assert rpa.e_tot == pytest.approx(-10.694392044197565, abs=1e-6)
+
+
+def test_krpa_with_fc(diamond_krhf):
+    rpa = KRPA(diamond_krhf)
+    rpa.fc = True
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.20723389722097715, abs=1e-6)
+    assert rpa.e_tot == pytest.approx(-10.716348738655793, abs=1e-6)
+
+
+def test_krpa_with_fc_outcore(diamond_krhf):
+    rpa = KRPA(diamond_krhf)
+    rpa.fc = True
+    rpa.outcore = True
+    rpa.segsize = 2
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.20723389722097715, abs=1e-6)
+    assert rpa.e_tot == pytest.approx(-10.716348738655793, abs=1e-6)
+
diff --git a/pyscf/pbc/gw/test/test_kugw.py b/pyscf/pbc/gw/test/test_kugw.py
deleted file mode 100644
index a4fd92d7e9..0000000000
--- a/pyscf/pbc/gw/test/test_kugw.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python
-
-import unittest
-import numpy
-import os
-from pyscf import lib
-from pyscf.pbc import gto, dft, scf, df
-from pyscf.pbc.gw import kugw_ac
-
-def setUpModule():
-    global cell, kmf, kpts
-    cell = gto.Cell()
-    cell.build(
-        unit = 'B',
-        a = [[ 0.,          6.74027466,  6.74027466],
-             [ 6.74027466,  0.,          6.74027466],
-             [ 6.74027466,  6.74027466,  0.        ]],
-        atom = '''H 0 0 0
-                  H 1.68506866 1.68506866 1.68506866
-                  H 3.37013733 3.37013733 3.37013733''',
-        basis = 'gth-dzvp',
-        pseudo = 'gth-pade',
-        verbose = 7,
-        output = '/dev/null',
-        charge = 0,
-        spin = None)
-    cell.spin = 3
-    kpts = cell.make_kpts([3,1,1], scaled_center=[0,0,0])
-    kmf = scf.KUHF(cell, kpts, exxdiv=None).density_fit()
-    kmf.run()
-
-def tearDownModule():
-    global cell, kmf
-    cell.stdout.close()
-    del cell, kmf
-
-class KnownValues(unittest.TestCase):
-    def test_gwac_pade(self):
-        gw = kugw_ac.KUGWAC(kmf)
-        gw.linearized = False
-        gw.ac = 'pade'
-        gw.fc = False
-        nocca, noccb = gw.nocc
-        gw.kernel(kptlist=[0,1,2], orbs=range(0, nocca+3))
-        self.assertAlmostEqual(gw.mo_energy[0][0][nocca-1], -0.28012813, 5)
-        self.assertAlmostEqual(gw.mo_energy[0][0][nocca],    0.13748876, 5)
-        self.assertAlmostEqual(gw.mo_energy[0][1][nocca-1], -0.29515851, 5)
-        self.assertAlmostEqual(gw.mo_energy[0][1][nocca],    0.14128011, 5)
-        self.assertAlmostEqual(gw.mo_energy[1][0][noccb-1], -0.33991721, 5)
-        self.assertAlmostEqual(gw.mo_energy[1][0][noccb],    0.10578847, 5)
-        self.assertAlmostEqual(gw.mo_energy[1][1][noccb-1], -0.33547973, 5)
-        self.assertAlmostEqual(gw.mo_energy[1][1][noccb],    0.08053408, 5)
-
-        gw.fc = True
-        nocca, noccb = gw.nocc
-        gw.kernel(kptlist=[0,1,2], orbs=range(0,nocca+3))
-        self.assertAlmostEqual(gw.mo_energy[0][0][nocca-1], -0.40244058, 5)
-        self.assertAlmostEqual(gw.mo_energy[0][0][nocca],    0.13618348, 5)
-        self.assertAlmostEqual(gw.mo_energy[0][1][nocca-1], -0.41743063, 5)
-        self.assertAlmostEqual(gw.mo_energy[0][1][nocca],    0.13997427, 5)
-        self.assertAlmostEqual(gw.mo_energy[1][0][noccb-1], -0.46133481, 5)
-        self.assertAlmostEqual(gw.mo_energy[1][0][noccb],    0.1044926 , 5)
-        self.assertAlmostEqual(gw.mo_energy[1][1][noccb-1], -0.4568894 , 5)
-        self.assertAlmostEqual(gw.mo_energy[1][1][noccb],    0.07922511, 5)
-
-if __name__ == '__main__':
-    print('Full Tests for KUGW')
-    unittest.main()
diff --git a/pyscf/pbc/gw/test/test_kugw_ac.py b/pyscf/pbc/gw/test/test_kugw_ac.py
new file mode 100644
index 0000000000..86ec2f11b5
--- /dev/null
+++ b/pyscf/pbc/gw/test/test_kugw_ac.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import pytest
+
+from pyscf.pbc import df, gto, scf
+from pyscf.pbc.gw.kugw_ac import KUGWAC
+
+
+@pytest.fixture(scope="module")
+def hydrogen_kuhf():
+    cell = gto.Cell()
+    cell.build(
+        unit="B",
+        a=[[0.0, 6.74027466, 6.74027466], [6.74027466, 0.0, 6.74027466], [6.74027466, 6.74027466, 0.0]],
+        atom="""H 0 0 0
+                  H 1.68506866 1.68506866 1.68506866
+                  H 3.37013733 3.37013733 3.37013733""",
+        basis="gth-dzvp",
+        pseudo="gth-pade",
+        verbose=0,
+        output="/dev/null",
+        charge=0,
+        spin=3,
+    )
+
+    kpts = cell.make_kpts([3, 1, 1], scaled_center=[0, 0, 0])
+    gdf = df.RSDF(cell, kpts)
+    gdf.build()
+
+    kmf = scf.KUHF(cell, kpts, exxdiv="ewald")
+    kmf.with_df = gdf
+    kmf.conv_tol = 1e-12
+    kmf.kernel()
+
+    yield kmf
+
+
+def test_kugwac_pade_no_fc(hydrogen_kuhf):
+    gw = KUGWAC(hydrogen_kuhf)
+    gw.qpe_linearized = False
+    gw.fc = False
+    gw.kernel(kptlist=[0, 1, 2], orbs=range(0, 5))
+
+    assert gw.mo_energy[0][0][1] == pytest.approx(-0.28661016, abs=1e-5)
+    assert gw.mo_energy[0][0][2] == pytest.approx(0.13952572, abs=1e-5)
+    assert gw.mo_energy[1][1][0] == pytest.approx(-0.34174199, abs=1e-5)
+    assert gw.mo_energy[1][1][1] == pytest.approx(0.08296260, abs=1e-5)
+
+
+def test_kugwac_pade_with_fc(hydrogen_kuhf):
+    gw = KUGWAC(hydrogen_kuhf)
+    gw.qpe_linearized = False
+    gw.fc = True
+    gw.kernel(kptlist=[0, 1, 2], orbs=range(0, 5))
+
+    assert gw.mo_energy[0][0][1] == pytest.approx(-0.48063839, abs=1e-5)
+    assert gw.mo_energy[0][0][2] == pytest.approx(0.13870787, abs=1e-5)
+    assert gw.mo_energy[1][1][0] == pytest.approx(-0.53502818, abs=1e-5)
+    assert gw.mo_energy[1][1][1] == pytest.approx(0.08214267, abs=1e-5)
+
+
+def test_kugwac_pade_with_fc_frozen_orbitals(hydrogen_kuhf):
+    gw = KUGWAC(hydrogen_kuhf)
+    gw.qpe_linearized = False
+    gw.fc = True
+    gw.frozen = [12, 13, 14]
+    gw.kernel()
+
+    assert gw.mo_energy[0][0][1] == pytest.approx(-0.47649992, abs=1e-5)
+    assert gw.mo_energy[0][0][2] == pytest.approx(0.14513332, abs=1e-5)
diff --git a/pyscf/pbc/gw/test/test_kurpa.py b/pyscf/pbc/gw/test/test_kurpa.py
new file mode 100644
index 0000000000..c9577ca2d0
--- /dev/null
+++ b/pyscf/pbc/gw/test/test_kurpa.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+import pytest
+
+from pyscf.pbc import df, gto, scf
+from pyscf.pbc.gw.kurpa import KURPA
+
+
+@pytest.fixture(scope="module")
+def hydrogen_kuhf():
+    cell = gto.Cell()
+    cell.build(
+        unit="B",
+        a=[[0.0, 6.74027466, 6.74027466], [6.74027466, 0.0, 6.74027466], [6.74027466, 6.74027466, 0.0]],
+        atom="""H 0 0 0
+                  H 1.68506866 1.68506866 1.68506866
+                  H 3.37013733 3.37013733 3.37013733""",
+        basis="gth-dzvp",
+        pseudo="gth-pade",
+        verbose=0,
+        output="/dev/null",
+        charge=0,
+        spin=3,
+    )
+
+    kpts = cell.make_kpts([3, 1, 1], scaled_center=[0, 0, 0])
+    gdf = df.RSDF(cell, kpts)
+    gdf.build()
+
+    kmf = scf.KUHF(cell, kpts, exxdiv="ewald")
+    kmf.with_df = gdf
+    kmf.conv_tol = 1e-12
+    kmf.kernel()
+
+    yield kmf
+
+    cell.stdout.close()
+
+
+def test_kurpa_no_fc(hydrogen_kuhf):
+    rpa = KURPA(hydrogen_kuhf)
+    rpa.fc = False
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.04288352903004621, abs=1e-6)
+    assert rpa.e_tot == pytest.approx(-1.584806462873674, abs=1e-6)
+
+
+def test_kurpa_no_fc_outcore(hydrogen_kuhf):
+    rpa = KURPA(hydrogen_kuhf)
+    rpa.fc = False
+    rpa.outcore = True
+    rpa.segsize = 3
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.04288352903004621, abs=1e-6)
+    assert rpa.e_tot == pytest.approx(-1.584806462873674, abs=1e-6)
+
+
+def test_kurpa_with_fc(hydrogen_kuhf):
+    rpa = KURPA(hydrogen_kuhf)
+    rpa.fc = True
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.04295466718074476, abs=1e-6)
+
+
+def test_kurpa_with_fc_outcore(hydrogen_kuhf):
+    rpa = KURPA(hydrogen_kuhf)
+    rpa.fc = True
+    rpa.outcore = True
+    rpa.segsize = 3
+    rpa.kernel()
+
+    assert rpa.e_corr == pytest.approx(-0.04295466718074476, abs=1e-6)
+
diff --git a/pyscf/pbc/scf/khf.py b/pyscf/pbc/scf/khf.py
index 3e054165cf..e49e95307b 100644
--- a/pyscf/pbc/scf/khf.py
+++ b/pyscf/pbc/scf/khf.py
@@ -531,9 +531,9 @@ def kmesh(self):
         '''The number of k-points along each axis in the first Brillouin zone'''
         from pyscf.pbc.tools.k2gamma import kpts_to_kmesh
         kpts = self.kpts
-        kmesh = kpts_to_kmesh(kpts)
+        kmesh = kpts_to_kmesh(self.cell, kpts)
         if len(kpts) != np.prod(kmesh):
-            logger.WARN(self, 'K-points specified in %s are not Monkhorst-Pack %s grids',
+            logger.warn(self, 'K-points specified in %s are not Monkhorst-Pack %s grids',
                         self, kmesh)
         return kmesh
 
diff --git a/pyscf/pbc/scf/test/test_hf.py b/pyscf/pbc/scf/test/test_hf.py
index 6abe18fb1f..df39dfe396 100644
--- a/pyscf/pbc/scf/test/test_hf.py
+++ b/pyscf/pbc/scf/test/test_hf.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import numpy
 from pyscf import lib
 from pyscf.scf import atom_hf
@@ -153,9 +152,9 @@ def test_init_guess_by_chkfile(self):
         numpy.random.seed(1)
         k = numpy.random.random(3)
         mf = pbchf.RHF(cell, k, exxdiv='vcut_sph')
-        mf.chkfile = tempfile.NamedTemporaryFile().name
         mf.max_cycle = 1
         mf.diis = None
+        mf.chkfile = lib.NamedTemporaryFile().name
         e1 = mf.kernel()
         self.assertAlmostEqual(e1, -4.132445328608581, 7)
 
diff --git a/pyscf/pbc/scf/test/test_khf.py b/pyscf/pbc/scf/test/test_khf.py
index 101d6b12f7..b1008ded1c 100644
--- a/pyscf/pbc/scf/test/test_khf.py
+++ b/pyscf/pbc/scf/test/test_khf.py
@@ -18,7 +18,6 @@
 #
 
 import unittest
-import tempfile
 import numpy as np
 
 from pyscf import lib
@@ -101,8 +100,8 @@ def test_init_guess_by_chkfile(self):
 
         kpts = cell.make_kpts(nk)
         kmf = khf.KRHF(cell, kpts, exxdiv='vcut_sph')
-        kmf.chkfile = tempfile.NamedTemporaryFile().name
         kmf.conv_tol = 1e-9
+        kmf.chkfile = lib.NamedTemporaryFile().name
         ekpt = kmf.scf()
         dm1 = kmf.make_rdm1()
         dm2 = kmf.from_chk(kmf.chkfile)
@@ -301,6 +300,13 @@ def test_damping(self):
         for k in range(len(kpts)):
             self.assertAlmostEqual(abs(f_damp[k] - (f[k]*(1-damp) + f_prev[k]*damp)).max(), 0, 9)
 
+    def test_kmesh_property(self):
+        kmf = cell.KRHF(kpts=cell.make_kpts([3,1,1]))
+        assert np.array_equal(kmf.kmesh, [3, 1, 1])
+
+        kmf = cell.KRHF(kpts=cell.make_kpts([18]*3))
+        assert np.array_equal(kmf.kmesh, [13, 13, 13])
+
 if __name__ == '__main__':
     print("Full Tests for pbc.scf.khf")
     unittest.main()
diff --git a/pyscf/pbc/scf/test/test_rohf.py b/pyscf/pbc/scf/test/test_rohf.py
index b093ef0c9f..641a0f9477 100644
--- a/pyscf/pbc/scf/test/test_rohf.py
+++ b/pyscf/pbc/scf/test/test_rohf.py
@@ -15,7 +15,6 @@
 #
 
 import unittest
-import tempfile
 import numpy as np
 from pyscf import lib
 from pyscf.pbc import gto as pgto
@@ -80,10 +79,10 @@ def test_init_guess_by_chkfile(self):
         np.random.seed(1)
         k = np.random.random(3)
         mf = pscf.KROHF(cell, [k], exxdiv='vcut_sph')
-        mf.chkfile = tempfile.NamedTemporaryFile().name
         mf.init_guess = 'hcore'
         mf.max_cycle = 1
         mf.diis = None
+        mf.chkfile = lib.NamedTemporaryFile().name
         e1 = mf.kernel()
         self.assertAlmostEqual(e1, -3.4376090968645068, 7)
 
diff --git a/pyscf/pbc/scf/test/test_uhf.py b/pyscf/pbc/scf/test/test_uhf.py
index 195c207eed..c684b0ca3d 100644
--- a/pyscf/pbc/scf/test/test_uhf.py
+++ b/pyscf/pbc/scf/test/test_uhf.py
@@ -15,7 +15,6 @@
 #
 
 import unittest
-import tempfile
 import numpy as np
 from pyscf import lib
 from pyscf.pbc import gto as pgto
@@ -82,9 +81,9 @@ def test_init_guess_by_chkfile(self):
         np.random.seed(1)
         k = np.random.random(3)
         mf = pscf.KUHF(cell, [k], exxdiv='vcut_sph')
-        mf.chkfile = tempfile.NamedTemporaryFile().name
         mf.max_cycle = 1
         mf.diis = None
+        mf.chkfile = lib.NamedTemporaryFile().name
         e1 = mf.kernel()
         self.assertAlmostEqual(e1, -3.4070772194665477, 7)
 
diff --git a/pyscf/pbc/tdscf/test/test_uks.py b/pyscf/pbc/tdscf/test/test_uks.py
index 69efe556a0..070a699925 100644
--- a/pyscf/pbc/tdscf/test/test_uks.py
+++ b/pyscf/pbc/tdscf/test/test_uks.py
@@ -87,7 +87,7 @@ def tearDownClass(cls):
 
     def kernel(self, TD, ref, **kwargs):
         td = getattr(self.mf, TD)().set(nstates=self.nstates, **kwargs).run()
-        self.assertAlmostEqual(abs(td.e[:self.nstates_test] * unitev  - ref).max(), 0, 5)
+        self.assertAlmostEqual(abs(td.e[:self.nstates_test] * unitev  - ref).max(), 0, 4)
         return td
 
     def test_tda(self):
@@ -103,7 +103,7 @@ def test_tdhf(self):
         td = self.kernel('TDDFT', ref, conv_tol=1e-8)
         a, b = td.get_ab()
         eref = diagonalize(a, b)
-        self.assertAlmostEqual(abs(td.e[:4] - eref[:4]).max(), 0, 8)
+        self.assertAlmostEqual(abs(td.e[:4] - eref[:4]).max(), 0, 7)
 
     def check_rsh_tda(self, xc, place=6):
         cell = self.cell
diff --git a/pyscf/pbc/x2c/sfx2c1e.py b/pyscf/pbc/x2c/sfx2c1e.py
index a2c3a1ffa4..d69a898b37 100644
--- a/pyscf/pbc/x2c/sfx2c1e.py
+++ b/pyscf/pbc/x2c/sfx2c1e.py
@@ -107,7 +107,16 @@ def get_hcore(self, cell=None, kpts=None, kpt=None):
         else:
             return super(x2c._X2C_SCF, self).get_hcore(cell, kpts)
 
-class PBCX2CHelper(x2c.X2C):
+    def undo_x2c(self):
+        obj = lib.view(self, lib.drop_class(self.__class__, SFX2C1E_SCF))
+        del obj.with_x2c
+        return obj
+
+    def to_gpu(self):
+        obj = self.undo_x2c().to_gpu().sfx2c1e()
+        return lib.to_gpu(self, obj)
+
+class PBCX2CHelper(x2c.X2CHelperBase):
 
     exp_drop = getattr(__config__, 'pbc_x2c_X2C_exp_drop', 0.2)
     # 1e: X2C1e, atom1e: X2C1e with one-center approximation
@@ -118,13 +127,15 @@ class PBCX2CHelper(x2c.X2C):
 
     def __init__(self, cell, kpts=None):
         self.cell = cell
-        x2c.X2C.__init__(self, cell)
+        x2c.X2CHelperBase.__init__(self, cell)
 
     def reset(self, cell=None):
         if cell is not None:
             self.cell = cell
         return self
 
+    to_gpu = lib.to_gpu
+
 class SpinFreeX2CHelper(PBCX2CHelper):
     '''1-component X2c Foldy-Wouthuysen (FW Hamiltonian  (spin-free part only)
     '''
@@ -142,11 +153,12 @@ def get_hcore(self, cell=None, kpts=None):
         c = lib.param.LIGHT_SPEED
         assert ('1E' in self.approx.upper())
         if 'ATOM' in self.approx.upper():
+            raise NotImplementedError(
+                'Atomic X is generated in molecular orbitals. '
+                'It might be incompatible with PBC setup.')
             atom_slices = xcell.offset_nr_by_atom()
             nao = xcell.nao_nr()
             x = numpy.zeros((nao,nao))
-            vloc = numpy.zeros((nao,nao))
-            wloc = numpy.zeros((nao,nao))
             for ia in range(xcell.natm):
                 ish0, ish1, p0, p1 = atom_slices[ia]
                 shls_slice = (ish0, ish1, ish0, ish1)
@@ -156,8 +168,6 @@ def get_hcore(self, cell=None, kpts=None):
                     z = -xcell.atom_charge(ia)
                     v1 = z * xcell.intor('int1e_rinv', shls_slice=shls_slice)
                     w1 = z * xcell.intor('int1e_prinvp', shls_slice=shls_slice)
-                vloc[p0:p1,p0:p1] = v1
-                wloc[p0:p1,p0:p1] = w1
                 x[p0:p1,p0:p1] = x2c._x2c1e_xmatrix(t1, v1, w1, s1, c)
         else:
             w = get_pnucp(with_df, kpts_lst)
@@ -175,13 +185,9 @@ def get_hcore(self, cell=None, kpts=None):
         h1_kpts = []
         for k in range(len(kpts_lst)):
             if 'ATOM' in self.approx.upper():
-                # The treatment of pnucp local part has huge effects to hcore
-                #h1 = x2c._get_hcore_fw(t[k], vloc, wloc, s[k], x, c) - vloc + v[k]
-                #h1 = x2c._get_hcore_fw(t[k], v[k], w[k], s[k], x, c)
-                h1 = x2c._get_hcore_fw(t[k], v[k], wloc, s[k], x, c)
+                h1 = x2c._get_hcore_fw(t[k], v[k], w[k], s[k], x, c)
             else:
-                xk = x2c._x2c1e_xmatrix(t[k], v[k], w[k], s[k], c)
-                h1 = x2c._get_hcore_fw(t[k], v[k], w[k], s[k], xk, c)
+                h1 = x2c._x2c1e_get_hcore(t[k], v[k], w[k], s[k], c)
 
             if self.basis is not None:
                 # If cell = xcell, U = identity matrix
@@ -201,6 +207,9 @@ def get_xmat(self, cell=None, kpts=None):
         c = lib.param.LIGHT_SPEED
         assert ('1E' in self.approx.upper())
         if 'ATOM' in self.approx.upper():
+            raise NotImplementedError(
+                'Atomic X is generated in molecular orbitals. '
+                'It might be incompatible with PBC setup.')
             atom_slices = xcell.offset_nr_by_atom()
             nao = xcell.nao_nr()
             x = numpy.zeros((nao,nao))
@@ -310,46 +319,3 @@ def get_pnucp(mydf, kpts=None):
     if kpts is None or numpy.shape(kpts) == (3,):
         wj_kpts = wj_kpts[0]
     return numpy.asarray(wj_kpts)
-
-
-if __name__ == '__main__':
-    from pyscf.pbc import scf
-    cell = pbcgto.Cell()
-    cell.build(unit = 'B',
-               a = numpy.eye(3)*4,
-               mesh = [11]*3,
-               atom = 'H 0 0 0; H 0 0 1.8',
-               verbose = 4,
-               basis='sto3g')
-    lib.param.LIGHT_SPEED = 2
-    mf = scf.RHF(cell)
-    mf.with_df = aft.AFTDF(cell)
-    enr = mf.kernel()
-    print('E(NR) = %.12g' % enr)
-
-    mf = sfx2c1e(mf)
-    esfx2c = mf.kernel()
-    print('E(SFX2C1E) = %.12g' % esfx2c)
-
-    mf = scf.KRHF(cell)
-    mf.with_df = aft.AFTDF(cell)
-    mf.kpts = cell.make_kpts([2,2,1])
-    enr = mf.kernel()
-    print('E(k-NR) = %.12g' % enr)
-
-    mf = sfx2c1e(mf)
-    esfx2c = mf.kernel()
-    print('E(k-SFX2C1E) = %.12g' % esfx2c)
-
-#    cell = pbcgto.M(unit = 'B',
-#               a = numpy.eye(3)*4,
-#               atom = 'H 0 0 0; H 0 0 1.8',
-#               mesh = None,
-#               dimension = 2,
-#               basis='sto3g')
-#    with_df = aft.AFTDF(cell)
-#    w0 = get_pnucp(with_df, cell.make_kpts([2,2,1]))
-#    with_df = aft.AFTDF(cell)
-#    with_df.eta = 0
-#    w1 = get_pnucp(with_df, cell.make_kpts([2,2,1]))
-#    print(abs(w0-w1).max())
diff --git a/pyscf/pbc/x2c/test/test_x2c.py b/pyscf/pbc/x2c/test/test_x2c.py
index c14eeadd6c..cfd435ca86 100644
--- a/pyscf/pbc/x2c/test/test_x2c.py
+++ b/pyscf/pbc/x2c/test/test_x2c.py
@@ -54,6 +54,7 @@ def tearDownModule():
     del cell, cell1
 
 class KnownValues(unittest.TestCase):
+    @unittest.skip('The implementation of atom-X approximation requires more validation.')
     def test_hf(self):
         with lib.light_speed(4) as c:
             mf = scf.RHF(cell1).sfx2c1e()
@@ -69,6 +70,7 @@ def test_hf(self):
             h1 = mf.get_hcore(kpt=kpts[1])
             self.assertAlmostEqual(numpy.einsum('ij,ji', dm, h1), -0.32361715420090226 + 0j, 8)
 
+    @unittest.skip('The implementation of atom-X approximation requires more validation.')
     def test_hf_high_cost(self):
         with lib.light_speed(2) as c:
             mf = scf.RHF(cell).sfx2c1e()
@@ -89,6 +91,7 @@ def test_hf_high_cost(self):
             h1 = mf.get_hcore(kpt=kpts[1])
             self.assertAlmostEqual(numpy.einsum('ij,ji', dm, h1), -0.04113247191600125+0j, 8)
 
+    @unittest.skip('The implementation of atom-X approximation requires more validation.')
     def test_khf_high_cost(self):
         with lib.light_speed(2) as c:
             mf = scf.KRHF(cell).sfx2c1e()
diff --git a/pyscf/pbc/x2c/x2c1e.py b/pyscf/pbc/x2c/x2c1e.py
index 69a8ca66f7..e4955a4f07 100644
--- a/pyscf/pbc/x2c/x2c1e.py
+++ b/pyscf/pbc/x2c/x2c1e.py
@@ -107,6 +107,15 @@ def get_hcore(self, cell=None, kpts=None, kpt=None):
         else:
             return super(x2c._X2C_SCF).get_hcore(cell, kpts)
 
+    def undo_x2c(self):
+        obj = lib.view(self, lib.drop_class(self.__class__, X2C1E_GSCF))
+        del obj.with_x2c
+        return obj
+
+    def to_gpu(self):
+        obj = self.undo_x2c().to_gpu().x2c1e()
+        return lib.to_gpu(self, obj)
+
 class SpinOrbitalX2C1EHelper(sfx2c1e.PBCX2CHelper):
     def get_hcore(self, cell=None, kpts=None):
         if cell is None:
@@ -125,7 +134,9 @@ def get_hcore(self, cell=None, kpts=None):
         c = lib.param.LIGHT_SPEED
 
         if 'ATOM' in self.approx.upper():
-            raise NotImplementedError
+            raise NotImplementedError(
+                'Atomic X is generated in molecular orbitals. '
+                'It might be incompatible with PBC setup.')
         else:
             w_sr = sfx2c1e.get_pnucp(with_df, kpts_lst)
             w_soc = get_pbc_pvxp(with_df, kpts_lst)
@@ -155,8 +166,7 @@ def get_hcore(self, cell=None, kpts=None):
             if 'ATOM' in self.approx.upper():
                 raise NotImplementedError
             else:
-                xk = x2c._x2c1e_xmatrix(t[k], v[k], w[k], s[k], c)
-                h1 = x2c._get_hcore_fw(t[k], v[k], w[k], s[k], xk, c)
+                h1 = x2c._x2c1e_get_hcore(t[k], v[k], w[k], s[k], c)
 
             if self.basis is not None:
                 # If cell = xcell, U = identity matrix
diff --git a/pyscf/qmmm/pbc/itrf.py b/pyscf/qmmm/pbc/itrf.py
index 408661f8d0..ac48a69a0c 100644
--- a/pyscf/qmmm/pbc/itrf.py
+++ b/pyscf/qmmm/pbc/itrf.py
@@ -804,60 +804,48 @@ def grad_kTij(R, r, eta):
         TGGcosGvRqm = lib.einsum("iab,ga,gb,ig->g", qm_quads, Gv, Gv, cosGvRqm)
         TGGsinGvRqm = lib.einsum("iab,ga,gb,ig->g", qm_quads, Gv, Gv, sinGvRqm)
 
+        DGqm = lib.einsum('ia,ga->ig', qm_dipoles, Gv)
+        TGGqm = lib.einsum('iab,ga,gb->ig', qm_quads, Gv, Gv)
+
         qm_ewg_grad = np.zeros_like(qm_coords)
         if with_mm:
             mm_ewg_grad = np.zeros_like(mm_coords)
 
         # qm pc - mm pc
-        p = ['einsum_path', (3, 4), (1, 3), (1, 2), (0, 1)]
-        qm_ewg_grad -= lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, sinGvRqm, zcosGvRmm, Gpref, optimize=p)
-        qm_ewg_grad += lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, cosGvRqm, zsinGvRmm, Gpref, optimize=p)
+        qm_ewg_grad -= qm_charges[:,None] * lib.einsum('ig,gx->ix', sinGvRqm, Gv*(zcosGvRmm*Gpref)[:,None])
+        qm_ewg_grad += qm_charges[:,None] * lib.einsum('ig,gx->ix', cosGvRqm, Gv*(zsinGvRmm*Gpref)[:,None])
         if with_mm:
-            p = ['einsum_path', (0, 2), (1, 2), (0, 2), (0, 1)]
-            mm_ewg_grad -= lib.einsum('i,gx,ig,g,g->ix', mm_charges, Gv, sinGvRmm, zcosGvRqm, Gpref, optimize=p)
-            mm_ewg_grad += lib.einsum('i,gx,ig,g,g->ix', mm_charges, Gv, cosGvRmm, zsinGvRqm, Gpref, optimize=p)
+            mm_ewg_grad -= mm_charges[:,None] * lib.einsum('ig,gx->ix', sinGvRmm, Gv*(zcosGvRqm*Gpref)[:,None])
+            mm_ewg_grad += mm_charges[:,None] * lib.einsum('ig,gx->ix', cosGvRmm, Gv*(zsinGvRqm*Gpref)[:,None])
         # qm dip - mm pc
-        p = ['einsum_path', (4, 5), (1, 4), (0, 1), (0, 2), (0, 1)]
-        qm_ewg_grad -= lib.einsum('ia,gx,ga,ig,g,g->ix', qm_dipoles, Gv, Gv, sinGvRqm, zsinGvRmm, Gpref, optimize=p)
-        qm_ewg_grad -= lib.einsum('ia,gx,ga,ig,g,g->ix', qm_dipoles, Gv, Gv, cosGvRqm, zcosGvRmm, Gpref, optimize=p)
+        qm_ewg_grad -= lib.einsum('ig,gx->ix', DGqm*sinGvRqm, Gv*(zsinGvRmm*Gpref)[:,None])
+        qm_ewg_grad -= lib.einsum('ig,gx->ix', DGqm*cosGvRqm, Gv*(zcosGvRmm*Gpref)[:,None])
         if with_mm:
-            p = ['einsum_path', (1, 3), (0, 2), (0, 2), (0, 1)]
-            mm_ewg_grad += lib.einsum('g,j,gx,jg,g->jx', DGcosGvRqm, mm_charges, Gv, cosGvRmm, Gpref, optimize=p)
-            mm_ewg_grad += lib.einsum('g,j,gx,jg,g->jx', DGsinGvRqm, mm_charges, Gv, sinGvRmm, Gpref, optimize=p)
+            mm_ewg_grad += mm_charges[:,None] * lib.einsum('ig,gx->ix', cosGvRmm, Gv*(DGcosGvRqm*Gpref)[:,None])
+            mm_ewg_grad += mm_charges[:,None] * lib.einsum('ig,gx->ix', sinGvRmm, Gv*(DGsinGvRqm*Gpref)[:,None])
         # qm quad - mm pc
-        p = ['einsum_path', (5, 6), (0, 5), (0, 2), (2, 3), (1, 2), (0, 1)]
-        qm_ewg_grad += lib.einsum('ga,gb,iab,gx,ig,g,g->ix', Gv, Gv, qm_quads,
-                                  Gv, sinGvRqm, zcosGvRmm, Gpref, optimize=p) / 3
-        qm_ewg_grad -= lib.einsum('ga,gb,iab,gx,ig,g,g->ix', Gv, Gv, qm_quads,
-                                  Gv, cosGvRqm, zsinGvRmm, Gpref, optimize=p) / 3
+        qm_ewg_grad += lib.einsum('ig,gx->ix', TGGqm*sinGvRqm, Gv*(zcosGvRmm*Gpref)[:,None]) / 3
+        qm_ewg_grad -= lib.einsum('ig,gx->ix', TGGqm*cosGvRqm, Gv*(zsinGvRmm*Gpref)[:,None]) / 3
         if with_mm:
-            p = ['einsum_path', (1, 3), (0, 2), (0, 2), (0, 1)]
-            mm_ewg_grad += lib.einsum('g,j,gx,jg,g->jx', TGGcosGvRqm, mm_charges, Gv, sinGvRmm, Gpref, optimize=p) / 3
-            mm_ewg_grad -= lib.einsum('g,j,gx,jg,g->jx', TGGsinGvRqm, mm_charges, Gv, cosGvRmm, Gpref, optimize=p) / 3
+            mm_ewg_grad += mm_charges[:,None] * lib.einsum('ig,gx->ix', sinGvRmm, Gv*(TGGcosGvRqm*Gpref)[:,None]) / 3
+            mm_ewg_grad -= mm_charges[:,None] * lib.einsum('ig,gx->ix', cosGvRmm, Gv*(TGGsinGvRqm*Gpref)[:,None]) / 3
 
         # qm pc - qm pc
-        p = ['einsum_path', (3, 4), (1, 3), (1, 2), (0, 1)]
-        qm_ewg_grad -= lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, sinGvRqm, zcosGvRqm, Gpref, optimize=p)
-        qm_ewg_grad += lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, cosGvRqm, zsinGvRqm, Gpref, optimize=p)
+        qm_ewg_grad -= qm_charges[:,None] * lib.einsum('ig,gx->ix', sinGvRqm, Gv*(zcosGvRqm*Gpref)[:,None])
+        qm_ewg_grad += qm_charges[:,None] * lib.einsum('ig,gx->ix', cosGvRqm, Gv*(zsinGvRqm*Gpref)[:,None])
         # qm pc - qm dip
-        qm_ewg_grad += lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, cosGvRqm, DGcosGvRqm, Gpref, optimize=p)
-        qm_ewg_grad += lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, sinGvRqm, DGsinGvRqm, Gpref, optimize=p)
-        p = ['einsum_path', (3, 5), (1, 4), (1, 3), (1, 2), (0, 1)]
-        qm_ewg_grad -= lib.einsum('ja,ga,gx,g,jg,g->jx', qm_dipoles, Gv, Gv, zsinGvRqm, sinGvRqm, Gpref, optimize=p)
-        qm_ewg_grad -= lib.einsum('ja,ga,gx,g,jg,g->jx', qm_dipoles, Gv, Gv, zcosGvRqm, cosGvRqm, Gpref, optimize=p)
+        qm_ewg_grad += qm_charges[:,None] * lib.einsum('ig,gx->ix', cosGvRqm, Gv*(DGcosGvRqm*Gpref)[:,None])
+        qm_ewg_grad += qm_charges[:,None] * lib.einsum('ig,gx->ix', sinGvRqm, Gv*(DGsinGvRqm*Gpref)[:,None])
+        qm_ewg_grad -= lib.einsum('ig,gx->ix', DGqm*sinGvRqm, Gv*(zsinGvRqm*Gpref)[:,None])
+        qm_ewg_grad -= lib.einsum('ig,gx->ix', DGqm*cosGvRqm, Gv*(zcosGvRqm*Gpref)[:,None])
         # qm dip - qm dip
-        p = ['einsum_path', (4, 5), (1, 4), (1, 3), (1, 2), (0, 1)]
-        qm_ewg_grad -= lib.einsum('ia,ga,gx,ig,g,g->ix', qm_dipoles, Gv, Gv, sinGvRqm, DGcosGvRqm, Gpref, optimize=p)
-        qm_ewg_grad += lib.einsum('ia,ga,gx,ig,g,g->ix', qm_dipoles, Gv, Gv, cosGvRqm, DGsinGvRqm, Gpref, optimize=p)
+        qm_ewg_grad -= lib.einsum('ig,gx->ix', DGqm*sinGvRqm, Gv*(DGcosGvRqm*Gpref)[:,None])
+        qm_ewg_grad += lib.einsum('ig,gx->ix', DGqm*cosGvRqm, Gv*(DGsinGvRqm*Gpref)[:,None])
         # qm pc - qm quad
-        p = ['einsum_path', (3, 4), (1, 3), (1, 2), (0, 1)]
-        qm_ewg_grad += lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, sinGvRqm, TGGcosGvRqm, Gpref, optimize=p) / 3
-        qm_ewg_grad -= lib.einsum('i,gx,ig,g,g->ix', qm_charges, Gv, cosGvRqm, TGGsinGvRqm, Gpref, optimize=p) / 3
-        p = ['einsum_path', (4, 6), (1, 5), (1, 2), (2, 3), (1, 2), (0, 1)]
-        qm_ewg_grad += lib.einsum('jab,ga,gb,gx,g,jg,g->jx', qm_quads, Gv, Gv,
-                                  Gv, zcosGvRqm, sinGvRqm, Gpref, optimize=p) / 3
-        qm_ewg_grad -= lib.einsum('jab,ga,gb,gx,g,jg,g->jx', qm_quads, Gv, Gv,
-                                  Gv, zsinGvRqm, cosGvRqm, Gpref, optimize=p) / 3
+        qm_ewg_grad += qm_charges[:,None] * lib.einsum('ig,gx->ix', sinGvRqm, Gv*(TGGcosGvRqm*Gpref)[:,None]) / 3
+        qm_ewg_grad -= qm_charges[:,None] * lib.einsum('ig,gx->ix', cosGvRqm, Gv*(TGGsinGvRqm*Gpref)[:,None]) / 3
+        qm_ewg_grad += lib.einsum('ig,gx->ix', TGGqm*sinGvRqm, Gv*(zcosGvRqm*Gpref)[:,None]) / 3
+        qm_ewg_grad -= lib.einsum('ig,gx->ix', TGGqm*cosGvRqm, Gv*(zsinGvRqm*Gpref)[:,None]) / 3
 
         logger.timer(self, 'grad_ewald k-space', *cput2)
         logger.timer(self, 'grad_ewald', *cput0)
diff --git a/pyscf/qmmm/pbc/mm_mole.py b/pyscf/qmmm/pbc/mm_mole.py
index d1d4d9533e..fbe93e5cbc 100644
--- a/pyscf/qmmm/pbc/mm_mole.py
+++ b/pyscf/qmmm/pbc/mm_mole.py
@@ -305,13 +305,11 @@ def get_ewald_pot(self, coords1, coords2=None, charges2=None):
             ewg0  = lib.einsum('ig,g,g->i', cosGvR1, zcosGvR2, Gpref)
             ewg0 += lib.einsum('ig,g,g->i', sinGvR1, zsinGvR2, Gpref)
             # qm dip - mm pc
-            p = ['einsum_path', (2, 3), (0, 2), (0, 1)]
-            ewg1  = lib.einsum('gx,ig,g,g->ix', Gv, cosGvR1, zsinGvR2, Gpref, optimize=p)
-            ewg1 -= lib.einsum('gx,ig,g,g->ix', Gv, sinGvR1, zcosGvR2, Gpref, optimize=p)
+            ewg1  = lib.einsum('gx,ig->ix', Gv*(zsinGvR2*Gpref)[:,None], cosGvR1)
+            ewg1 -= lib.einsum('gx,ig->ix', Gv*(zcosGvR2*Gpref)[:,None], sinGvR1)
             # qm quad - mm pc
-            p = ['einsum_path', (3, 4), (0, 3), (0, 2), (0, 1)]
-            ewg2  = -lib.einsum('gx,gy,ig,g,g->ixy', Gv, Gv, cosGvR1, zcosGvR2, Gpref, optimize=p)
-            ewg2 += -lib.einsum('gx,gy,ig,g,g->ixy', Gv, Gv, sinGvR1, zsinGvR2, Gpref, optimize=p)
+            ewg2  = -lib.einsum('ig,gx,gy->ixy', cosGvR1*(zcosGvR2*Gpref)[None,:], Gv, Gv)
+            ewg2 += -lib.einsum('ig,gx,gy->ixy', sinGvR1*(zsinGvR2*Gpref)[None,:], Gv, Gv)
             ewg2 /= 3
         else:
             # qm pc - qm pc
diff --git a/pyscf/scf/__init__.py b/pyscf/scf/__init__.py
index 0dca602af5..07531aff45 100644
--- a/pyscf/scf/__init__.py
+++ b/pyscf/scf/__init__.py
@@ -36,11 +36,12 @@
     chkfile : str
         checkpoint file to save MOs, orbital energies etc.
     conv_tol : float
-        converge threshold.  Default is 1e-10
+        converge threshold.  Default is 1e-9
     max_cycle : int
         max number of iterations.  Default is 50
     init_guess : str
-        initial guess method.  It can be one of 'minao', 'atom', '1e', 'chkfile'.
+        initial guess method.  It can be one of 'minao', 'atom', 'huckel',
+        'mod_huckel', '1e', 'hcore', 'sap', 'chkfile'.
         Default is 'minao'
     DIIS : class listed in :mod:`scf.diis`
         Default is :class:`diis.SCF_DIIS`. Set it to None/False to turn off DIIS.
diff --git a/pyscf/scf/_vhf.py b/pyscf/scf/_vhf.py
index 65833914bf..98fb062b94 100644
--- a/pyscf/scf/_vhf.py
+++ b/pyscf/scf/_vhf.py
@@ -15,15 +15,15 @@
 
 import sys
 import ctypes
-import _ctypes
 import numpy
 from pyscf import lib
 from pyscf import gto
 from pyscf.gto.moleintor import make_cintopt, make_loc, ascint3
 
 libcvhf = lib.load_library('libcvhf')
+
 def _fpointer(name):
-    return ctypes.c_void_p(_ctypes.dlsym(libcvhf._handle, name))
+    return ctypes.cast(getattr(libcvhf, name), ctypes.c_void_p)
 
 class VHFOpt:
     def __init__(self, mol, intor=None,
diff --git a/pyscf/scf/dhf.py b/pyscf/scf/dhf.py
index 1703ceab0d..18e92fb606 100644
--- a/pyscf/scf/dhf.py
+++ b/pyscf/scf/dhf.py
@@ -842,8 +842,39 @@ def __init__(self, mol):
             raise RuntimeError('zquatev library is required to perform Kramers-restricted DHF')
         UHF.__init__(self, mol)
 
+    def check_linear_dependency(self, s, verbose=None):
+        log = logger.new_logger(self, verbose)
+        idx = _kramers_pair_sort_ao_idx(self.mol, four_component=True)
+        s = s[idx[:,None], idx]
+        e, v = zquatev.eigh(s)
+        if log is not None:
+            abs_e = abs(e)
+            emax = abs_e.max()
+            emin = abs_e.min()
+            c = emax / emin
+            log.debug('cond(S) = %s', c)
+            if c > 1e10:
+                log.warn('Singularity detected in the overlap matrix. '
+                         'SCF may be inaccurate and difficult to converge.')
+
+        if hf.remove_overlap_zero_eigenvalue:
+            mask = e > hf.overlap_zero_eigenvalue_threshold
+            x = v[:,mask] / numpy.sqrt(e[mask])
+        else:
+            x = v / numpy.sqrt(e)
+        x1 = numpy.empty_like(x)
+        x1[idx] = x
+        return x1
+
     def _eigh(self, h, s, overwrite=False, x=None):
-        return zquatev.solve_KR_FCSCE(self.mol, h, s)
+        if x is None:
+            if h.dtype != s.dtype:
+                s = s.astype(h.dtype)
+            return zquatev.solve_KR_FCSCE(self.mol, h, s)
+        else:
+            h = x.conj().T.dot(h).dot(x)
+            e, c = zquatev.eigh(h, iop=1)
+            return e, x.dot(c)
 
     def x2c1e(self):
         from pyscf.x2c import x2c
@@ -1110,26 +1141,13 @@ def set_dm(self, dm, atm, bas, env):
                 mol._bas.ctypes, ctypes.c_int(nbas), mol._env.ctypes)
         self.dm_cond = dm_cond
 
-
-if __name__ == '__main__':
-    import pyscf.gto
-    mol = pyscf.gto.Mole()
-    mol.verbose = 5
-    mol.output = 'out_dhf'
-
-    mol.atom.extend([['He', (0.,0.,0.)], ])
-    mol.basis = {
-        'He': [(0, 0, (1, 1)),
-               (0, 0, (3, 1)),
-               (1, 0, (1, 1)), ]}
-    mol.build()
-
-    ##############
-    # SCF result
-    method = UHF(mol)
-    energy = method.scf() #-2.38146942868
-    print(energy)
-    method.with_gaunt = True
-    print(method.scf()) # -2.38138339005
-    method.with_breit = True
-    print(method.scf()) # -2.38138339005
+def _kramers_pair_sort_ao_idx(mol, four_component=True):
+    trmaps = mol.time_reversal_map()
+    idxA = numpy.where(trmaps > 0)[0]
+    idxB = trmaps[idxA] - 1
+    if four_component:
+        n = trmaps.size
+        idx = numpy.hstack((idxA,idxA+n,idxB,idxB+n))
+    else:
+        idx = numpy.hstack((idxA,idxB))
+    return idx
diff --git a/pyscf/scf/dispersion.py b/pyscf/scf/dispersion.py
index 1c4a86f01c..91080c709e 100644
--- a/pyscf/scf/dispersion.py
+++ b/pyscf/scf/dispersion.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2014-2023 The PySCF Developers. All Rights Reserved.
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,13 +14,13 @@
 # limitations under the License.
 #
 # Author: Xiaojie Wu <wxj6000@gmail.com>
+# modified by Jiashu Liang <jsliang25@gmail.com>
 #
 
 '''
 dispersion correction for HF and DFT
 '''
 
-import warnings
 from functools import lru_cache
 from pyscf.lib import logger
 from pyscf import scf
@@ -46,6 +46,11 @@
     'wb97m-d3bj': ('wb97m-v', False, 'd3bj'),
     'b97m-d3bj': ('b97m-v', False, 'd3bj'),
     'wb97x-d3bj': ('wb97x-v', False, 'd3bj'),
+    'wb97x-3c': ('wb97x-v', False, 'd4:wb97x-3c'),
+    # CF22D is parameterized together with its D3 (zero-damping) dispersion
+    # correction, so it is enabled by default. The cf22d damping parameters
+    # are shipped with simple-dftd3 (>=1.2.1) under zero damping.
+    'cf22d': ('cf22d', '', 'd3zero'),
 }
 
 # These xc functionals are not supported yet
@@ -71,7 +76,9 @@ def parse_dft(xc_code):
         return _white_list[method_lower]
 
     if method_lower.endswith('-3c'):
-        raise NotImplementedError('*-3c methods are not supported yet.')
+        if method_lower == "wb97x-3c":
+            return _white_list[method_lower]
+        raise NotImplementedError('Only wb97x-3c is supported for now. Other 3c methods are not supported yet.')
 
     if '-d3' in method_lower or '-d4' in method_lower:
         xc, disp = method_lower.split('-')
@@ -81,39 +88,112 @@ def parse_dft(xc_code):
     return xc, '', disp
 
 @lru_cache(128)
-def parse_disp(dft_method):
+def parse_disp(dft_method=None, disp=None):
     '''Decode the disp parameters based on the xc code.
-    Returns xc_code_for_dftd3, disp_version, with_3body
 
-    Example: b3lyp-d3bj2b -> (b3lyp, d3bj, False)
-             wb97x-d3bj   -> (wb97x, d3bj, False)
+    The logic for determining the dispersion parameters is as follows:
+    1. If `disp` is specified, it takes precedence.
+       - If `disp` contains ':', it is parsed as `disp_version:method`.
+       - Otherwise, the method is derived from `dft_method`.
+    2. If `disp` is not specified, the dispersion settings are inferred from `dft_method`.
+
+    The `with_3body` flag is determined by the dispersion version suffix:
+    - '2b' suffix -> False (2-body only)
+    - 'atm' suffix -> True (Axilrod-Teller-Muto 3-body term)
+    - 'd4' -> True (D4 always includes 3-body)
+    - 'd3' (without suffix) -> False
+
+    Args:
+        dft_method (str): The DFT method name (e.g., 'b3lyp', 'wb97x-d3bj').
+        disp (str): Explicit dispersion version (e.g., 'd3bj', 'd3bjatm').
+
+    Returns:
+        tuple: (disp_method, disp_version, with_3body)
+
+    Examples:
+        >>> parse_disp('b3lyp-d3bj2b')
+        ('b3lyp', 'd3bj', False)
+        >>> parse_disp('b3lyp-d3bjatm')
+        ('b3lyp', 'd3bj', True)
+        >>> parse_disp('wb97x-d3bj')
+        ('wb97x', 'd3bj', False)
+        >>> parse_disp(None, 'd4:wb97x-3c')
+        ('wb97x-3c', 'd4', True)
     '''
-    if dft_method == 'hf':
-        return 'hf', None, False
 
-    dft_lower = dft_method.lower()
-    xc, nlc, disp = parse_dft(dft_lower)
-    if dft_lower in XC_MAP:
-        xc = XC_MAP[dft_lower]
+    # If anything not specified, return None
+    if dft_method is None and disp is None:
+        return None, None, False
+
+    def process_3body(disp_version):
+        if not disp_version:
+            return disp_version, False
+        if disp_version.endswith('2b'):
+            return disp_version[:-2], False
+        elif disp_version.endswith('atm'):
+            return disp_version[:-3], True
+        elif 'd4' in disp_version:
+            return disp_version, True
+        elif 'd3' in disp_version:
+            return disp_version, False
+        else:
+            raise ValueError(f"Unknown dispersion version {disp_version} in parse_disp.")
+
+    if dft_method is not None:
+        dft_lower = dft_method.lower()
+        xc, _, disp_from_dft = parse_dft(dft_lower)
+        if xc in XC_MAP:
+            xc = XC_MAP[xc]
+
+    # Use disp if specified
+    # returned method will be the latter part of disp if disp is a string with colon, otherwise, use xc
+    if disp is not None:
+        if ":" in disp:
+            disp_version, method = disp.split(':')
+            disp_version, with_3body = process_3body(disp_version)
+            return method, disp_version, with_3body
+        elif dft_method is not None:
+            disp, with_3body = process_3body(disp)
+            return xc, disp, with_3body
+        else:
+            raise ValueError(f"the method used in dispersion {disp} is not specified.")
+
+    # otherwise, use disp_from_dft
+    if disp_from_dft is None:
+        return None, None, False
+
+    if ":" in disp_from_dft:
+        disp_version, method = disp_from_dft.split(':')
+        disp_version, with_3body = process_3body(disp_version)
+        return method, disp_version, with_3body
+
+    disp_from_dft, with_3body = process_3body(disp_from_dft)
+    return xc, disp_from_dft, with_3body
 
-    if disp is None:
-        return xc, None, False
-    disp_lower = disp.lower()
-    if disp_lower.endswith('2b'):
-        return xc, disp_lower.replace('2b', ''), False
-    elif disp_lower.endswith('atm'):
-        return xc, disp_lower.replace('atm', ''), True
-    else:
-        return xc, disp_lower, False
 
 def check_disp(mf, disp=None):
-    '''Check whether to apply dispersion correction based on the xc attribute.
-    If dispersion is allowed, return the DFTD3 disp version, such as d3bj,
-    d3zero, d4.
+    '''Check if dispersion correction should be applied and if the version is supported.
+
+    The function determines the dispersion method from the SCF object (`mf`) or the
+    explicit `disp` argument. It then verifies if the determined dispersion version
+    is supported in `DISP_VERSIONS`.
+
+    Args:
+        mf (scf.hf.SCF): The SCF object (HF or DFT).
+        disp (str or bool, optional): Dispersion version to check.
+            If None, uses `mf.disp`.
+            If False, returns False immediately.
+
+    Returns:
+        bool: True if dispersion is enabled and supported.
+              False if dispersion is disabled (disp=False) or not specified/implied.
+
+    Raises:
+        ValueError: If the dispersion version is not supported.
     '''
     if disp is None:
-        disp = mf.disp
-    if disp == 0: # disp = False
+        disp = getattr(mf, 'disp', None)
+    if disp is False or disp == 0:
         return False
 
     # To prevent mf.do_disp() triggering the SCF.__getattr__ method, do not use
@@ -123,38 +203,60 @@ def check_disp(mf, disp=None):
     else:
         # Set the disp method for both HF and MCSCF to 'hf'
         method = 'hf'
-    disp_version = parse_disp(method)[1]
+    disp_version = parse_disp(method, disp)[1]
 
-    if disp is None: # Using the disp version decoded from the mf.xc attribute
-        if disp_version is None:
-            return False
-    elif disp_version is None: # Using the version specified by mf.disp
-        disp_version = disp
-    elif disp != disp_version:
-        raise RuntimeError(f'mf.disp {disp} conflicts with mf.xc {method}')
+    if disp_version is None:
+        return False
 
     if disp_version not in DISP_VERSIONS:
-        raise NotImplementedError
-    return disp_version
+        raise ValueError(f"Unknown dispersion version {disp_version}.")
+    return True
 
 def get_dispersion(mf, disp=None, with_3body=None, verbose=None):
-    disp_version = check_disp(mf, disp)
-    if not disp_version:
+    '''
+    Calculate the dispersion correction energy.
+
+    Args:
+        mf : SCF object
+            The SCF object.
+        disp : str, optional
+            The dispersion correction version. Default is None.
+            Format examples: "d3", "d3bj", "d4", "d3bj2b", "d3bjatm", "d4:wb97x-3c", etc.
+            Note: In "d4:wb97x-3c", the latter part follows the method id of simple-dftd3 and dftd4 repo.
+        with_3body : bool, optional
+            Whether to include the 3-body term. Default is None.
+        verbose : int, optional
+            The verbose level. Default is None.
+
+    Returns:
+        float
+            The dispersion correction energy.
+
+    Note:
+        Priority of `disp` and `with_3body`:
+        1. Function arguments (disp, with_3body)
+        2. mf.disp (if available)
+        3. mf.xc (parsed from the functional name)
+    '''
+    if not check_disp(mf, disp):
         return 0.
 
+    if disp is None:
+        disp = getattr(mf, 'disp', None)
+
     try:
         from pyscf.dispersion import dftd3, dftd4
     except ImportError:
         print('dftd3 and dftd4 not available. Install them with `pip install pyscf-dispersion`')
         raise
 
-    mol = mf.mol
-    method = getattr(mf, 'xc', 'hf')
-    method, _, disp_with_3body = parse_disp(method)
-
-    if with_3body is not None:
+    dft_method = getattr(mf, 'xc', 'hf')
+    method, disp_version, disp_with_3body = parse_disp(dft_method, disp)
+    if with_3body is None:
         with_3body = disp_with_3body
 
+    mol = mf.mol
+
     # for dftd3
     if disp_version[:2].upper() == 'D3':
         logger.info(mf, "Calc dispersion correction with DFTD3.")
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index 3fcab01f8d..d63dad977e 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -21,8 +21,6 @@
 '''
 
 import sys
-import tempfile
-
 from functools import reduce
 import numpy
 import scipy.linalg
@@ -766,7 +764,7 @@ def get_init_guess(mol, key='minao', **kwargs):
 
     Kwargs:
         key : str
-            One of 'minao', 'atom', 'huckel', 'hcore', '1e', 'sap', 'chkfile'.
+            One of 'minao', 'atom', 'huckel', 'mod_huckel', 'hcore', '1e', 'sap', 'chkfile'.
     '''
     return RHF(mol).get_init_guess(mol, key, **kwargs)
 
@@ -1654,7 +1652,8 @@ class SCF(lib.StreamObject):
             be skipped and the kernel function will compute only the total
             energy based on the initial guess. Default value is 50.
         init_guess : str
-            initial guess method.  It can be one of 'minao', 'atom', 'huckel', 'hcore', '1e', 'sap', 'chkfile'.
+            initial guess method.  It can be one of 'minao', 'atom', 'huckel',
+            'mod_huckel', 'hcore', '1e', 'sap', 'chkfile'.
             Default is 'minao'
         sap_basis : str or dict
             basis for SAP initial guess, either filename or path as str or
@@ -1771,7 +1770,7 @@ def __init__(self, mol):
         else:
             # the chkfile will be removed automatically, to save the chkfile, assign a
             # filename to self.chkfile
-            self._chkfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+            self._chkfile = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
             self.chkfile = self._chkfile.name
 
 ##################################################
@@ -1946,7 +1945,7 @@ def init_guess_by_huckel(self, mol=None):
         return self.make_rdm1(mo_coeff, mo_occ)
 
     @lib.with_doc(init_guess_by_mod_huckel.__doc__)
-    def init_guess_by_mod_huckel(self, updated_rule, mol=None):
+    def init_guess_by_mod_huckel(self, mol=None):
         if mol is None: mol = self.mol
         logger.info(self, '''Initial guess from on-the-fly Huckel, doi:10.1021/acs.jctc.8b01089,
 employing the updated GWH rule from doi:10.1021/ja00480a005.''')
diff --git a/pyscf/scf/test/test_dhf.py b/pyscf/scf/test/test_dhf.py
index 2db8b48017..280ca4dbcd 100644
--- a/pyscf/scf/test/test_dhf.py
+++ b/pyscf/scf/test/test_dhf.py
@@ -100,25 +100,25 @@ def test_get_grad(self):
         g = mf.get_grad(mf.mo_coeff, mf.mo_occ)
         self.assertAlmostEqual(abs(g).max(), 0, 5)
 
-    if scf.dhf.zquatev:
-        def test_rhf(self):
-            mol = gto.M(
-                verbose = 5,
-                output = '/dev/null',
-                atom = '''
-                    O     0    0        0
-                    H     0    -0.757   0.587
-                    H     0    0.757    0.587''',
-                basis = '631g',
-            )
-            mf = scf.dhf.RHF(mol)
-            mf.with_ssss = False
-            mf.conv_tol_grad = 1e-5
-            self.assertAlmostEqual(mf.kernel(), -76.03852477545016, 8)
-
-            mf.ssss_approx = None
-            mf.conv_tol_grad = 1e-5
-            self.assertAlmostEqual(mf.kernel(), -76.03852480744785, 8)
+    @unittest.skipIf(scf.dhf.zquatev is None, 'requires zquatev')
+    def test_rhf(self):
+        mol = gto.M(
+            verbose = 5,
+            output = '/dev/null',
+            atom = '''
+                O     0    0        0
+                H     0    -0.757   0.587
+                H     0    0.757    0.587''',
+            basis = '631g',
+        )
+        mf = scf.dhf.RHF(mol)
+        mf.with_ssss = False
+        mf.conv_tol_grad = 1e-5
+        self.assertAlmostEqual(mf.kernel(), -76.03852477545016, 8)
+
+        mf.ssss_approx = None
+        mf.conv_tol_grad = 1e-5
+        self.assertAlmostEqual(mf.kernel(), -76.03852480744785, 8)
 
     def test_get_veff(self):
         n4c = mol.nao_2c() * 2
@@ -326,6 +326,23 @@ def test_h2_sto3g(self):
         e = mol.DHF().kernel()
         self.assertAlmostEqual(e, -1.066122658859047, 12)
 
+    def test_he_with_gaunt(self):
+        mol = gto.M(
+            atom=[['He', (0.,0.,0.)]],
+            basis = {
+                'He': [(0, 0, (1, 1)),
+                       (0, 0, (3, 1)),
+                       (1, 0, (1, 1)), ]})
+        method = mol.DHF()
+        energy = method.scf()
+        self.assertAlmostEqual(energy, -2.38146942868, 8)
+        method.with_gaunt = True
+        energy = method.scf()
+        self.assertAlmostEqual(energy, -2.38138339005, 8)
+        method.with_breit = True
+        energy = method.scf()
+        self.assertAlmostEqual(energy, -2.38138339005, 8)
+
 def _fill_gaunt(mol, erig):
     n2c = erig.shape[0]
     n4c = n2c * 2
diff --git a/pyscf/scf/test/test_diffuse_orbital.py b/pyscf/scf/test/test_diffuse_orbital.py
index 2ac3420d15..a9ca8eedac 100644
--- a/pyscf/scf/test/test_diffuse_orbital.py
+++ b/pyscf/scf/test/test_diffuse_orbital.py
@@ -19,6 +19,11 @@
 from pyscf import lib
 from pyscf import scf, dft
 
+try:
+    from pyscf.dispersion import dftd3
+except (ImportError, OSError):
+    dftd3 = None
+
 def setUpModule():
     global mol
     mol = pyscf.M(
@@ -97,6 +102,23 @@ def test_rhf_soscf(self):
             [[ 2.44273951e-01,  2.44377010e-02,  6.79546462e-17],
              [-2.44288315e-01, -2.44313901e-02, -1.66959137e-16]])).max() < 1e-5
 
+    @unittest.skipIf(dftd3 is None, "dftd3 not available")
+    def test_rks_soscf(self):
+        mf = dft.RKS(mol, xc = "wB97M-d3bj")
+        mf.grids.atom_grid = (99,590)
+        mf.conv_tol = 1e-10
+        mf = mf.newton()
+        energy = mf.kernel()
+        assert mf.converged
+        assert np.abs(energy - -7.773544875779531) < 1e-5
+
+        gobj = mf.Gradients()
+        gradient = gobj.kernel()
+        assert np.max(np.abs(gradient - np.array([
+            [ 2.44614610e-01,  2.44653881e-02, -3.14001231e-18],
+            [-2.44641034e-01, -2.44569088e-02, -6.41480825e-18],
+        ]))) < 1e-5
+
     def test_uhf(self):
         mf = dft.RKS(mol, xc = "PBE")
         mf.grids.atom_grid = (50,194)
diff --git a/pyscf/scf/test/test_diis.py b/pyscf/scf/test/test_diis.py
index afa53d989b..524f220e1d 100644
--- a/pyscf/scf/test/test_diis.py
+++ b/pyscf/scf/test/test_diis.py
@@ -14,9 +14,9 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 import numpy
 from pyscf import gto
+from pyscf import lib
 from pyscf import scf
 from pyscf.scf import diis
 
@@ -82,7 +82,7 @@ def test_diis_restart(self):
         H     0    1.757    1.587''',
             basis = '631g',
         )
-        tmpf = tempfile.NamedTemporaryFile()
+        tmpf = lib.NamedTemporaryFile()
         mf = scf.RHF(mol)
         mf.diis_file = tmpf.name
         eref = mf.kernel()
diff --git a/pyscf/scf/test/test_dispersion_logic.py b/pyscf/scf/test/test_dispersion_logic.py
new file mode 100644
index 0000000000..3e22ae3edd
--- /dev/null
+++ b/pyscf/scf/test/test_dispersion_logic.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# Copyright 2014-2026 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from pyscf import gto, scf
+from pyscf.scf import dispersion
+
+
+class KnownKS(scf.hf.KohnShamDFT):
+    def __init__(self, xc='b3lyp'):
+        self.xc = xc
+        self.disp = None
+
+
+class KnownHF(scf.hf.SCF):
+    def __init__(self):
+        self.disp = None
+
+
+class TestDispersionLogic(unittest.TestCase):
+    def test_parse_disp_none(self):
+        # Case 1: All None
+        self.assertEqual(dispersion.parse_disp(None, None), (None, None, False))
+
+    def test_parse_disp_explicit(self):
+        # Case 2: Explicit disp takes precedence
+        # b3lyp normally has no disp.
+        self.assertEqual(dispersion.parse_disp('b3lyp', 'd3bj'), ('b3lyp', 'd3bj', False))
+
+        # disp with colon override method
+        self.assertEqual(dispersion.parse_disp(None, 'd4:wb97x-3c'), ('wb97x-3c', 'd4', True))
+        self.assertEqual(dispersion.parse_disp('b3lyp', 'd3bj:pbe'), ('pbe', 'd3bj', False))
+
+        # disp with suffix
+        self.assertEqual(dispersion.parse_disp('b3lyp', 'd3bj2b'), ('b3lyp', 'd3bj', False))
+        self.assertEqual(dispersion.parse_disp('b3lyp', 'd3bjatm'), ('b3lyp', 'd3bj', True))
+
+        # d4 always implies 3body
+        self.assertEqual(dispersion.parse_disp('b3lyp', 'd4'), ('b3lyp', 'd4', True))
+
+    def test_parse_disp_from_method(self):
+        # Case 3: Infer from method
+        # b3lyp -> no disp
+        self.assertEqual(dispersion.parse_disp('b3lyp'), (None, None, False))
+
+        # wb97x-d3bj -> d3bj
+        self.assertEqual(dispersion.parse_disp('wb97x-d3bj'), ('wb97x', 'd3bj', False))
+
+        # wb97x-d4s -> d4s
+        self.assertEqual(dispersion.parse_disp('wb97x-d4s'), ('wb97x', 'd4s', True))
+
+        # wb97x-3c -> d4, 3body=True (from whitelist)
+        self.assertEqual(dispersion.parse_disp('wb97x-3c'), ('wb97x-3c', 'd4', True))
+
+    def test_parse_disp_errors(self):
+        # Unknown disp version
+        with self.assertRaises(ValueError):
+            dispersion.parse_disp('b3lyp', 'unknown_ver')
+
+        # Disp specified but method unknown/missing (if disp string doesn't contain colon)
+        # Actually parse_disp(None, 'd3bj') -> raises ValueError "the method used in dispersion d3bj is not specified."
+        with self.assertRaises(ValueError):
+            dispersion.parse_disp(None, 'd3bj')
+
+    def test_check_disp(self):
+        mol = gto.M(atom='H 0 0 0; H 0 0 1')
+
+        # 1. RHF object (no .xc)
+        mf_hf = scf.RHF(mol)
+        self.assertFalse(dispersion.check_disp(mf_hf))
+
+        # If mf.disp = None
+        mf_hf.disp = None
+        # parse_disp('hf', None) -> ('hf', None, False) -> check_disp returns False
+        self.assertFalse(dispersion.check_disp(mf_hf))
+
+        # If we set mf.disp = 'd3bj'
+        mf_hf.disp = 'd3bj'
+        self.assertTrue(dispersion.check_disp(mf_hf))
+
+        # 2. KohnShamDFT object (has .xc)
+        mf_dft = KnownKS()
+        mf_dft.xc = 'b3lyp'
+        mf_dft.disp = None
+
+        # b3lyp -> no disp -> False
+        self.assertFalse(dispersion.check_disp(mf_dft))
+
+        # Explicit disp
+        self.assertTrue(dispersion.check_disp(mf_dft, disp='d3bj'))
+
+        # Explicit disp=False
+        self.assertFalse(dispersion.check_disp(mf_dft, disp=False))
+
+        # Implicit disp from method
+        mf_dft.xc = 'wb97x-d3bj'
+        self.assertTrue(dispersion.check_disp(mf_dft))
+
+        # Unsupported disp version
+        with self.assertRaises(ValueError):
+            dispersion.check_disp(mf_dft, disp='unsupported')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pyscf/scf/test/test_ghf.py b/pyscf/scf/test/test_ghf.py
index e5ce7b393a..4639e7f89b 100644
--- a/pyscf/scf/test/test_ghf.py
+++ b/pyscf/scf/test/test_ghf.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import numpy
 import scipy.linalg
 from functools import reduce
@@ -41,7 +40,7 @@ def setUpModule():
     )
     mf = scf.GHF(mol)
     mf.conv_tol = 1e-12
-    mf.chkfile = tempfile.NamedTemporaryFile().name
+    mf.chkfile = lib.NamedTemporaryFile().name
     mf.kernel()
 
     molsym = gto.M(
@@ -57,8 +56,8 @@ def setUpModule():
     mfsym = scf.GHF(molsym).run(conv_tol=1e-10)
 
     mol1 = gto.M(atom=mol.atom, basis='631g', spin=2, verbose=0)
-    mf_r = scf.RHF(mol1).run(conv_tol=1e-10, chkfile=tempfile.NamedTemporaryFile().name)
-    mf_u = scf.RHF(mol1).run(conv_tol=1e-10, chkfile=tempfile.NamedTemporaryFile().name)
+    mf_r = scf.RHF(mol1).run(conv_tol=1e-10, chkfile=lib.NamedTemporaryFile().name)
+    mf_u = scf.RHF(mol1).run(conv_tol=1e-10, chkfile=lib.NamedTemporaryFile().name)
 
 def tearDownModule():
     global mol, mf, molsym, mfsym, mol1, mf_r, mf_u
@@ -110,7 +109,7 @@ def test_init_guess_atom(self):
         self.assertAlmostEqual(lib.fp(dm[24:,24:])*2, 2.7821827416174094, 7)
 
     def test_init_guess_chk(self):
-        dm = mol.GHF(chkfile=tempfile.NamedTemporaryFile().name).get_init_guess(mol, key='chkfile')
+        dm = mol.GHF(chkfile=lib.NamedTemporaryFile().name).get_init_guess(mol, key='chkfile')
         self.assertEqual(dm.shape, (48,48))
         self.assertAlmostEqual(lib.fp(dm), 1.8117584283411752, 5)
 
diff --git a/pyscf/scf/test/test_h2o.py b/pyscf/scf/test/test_h2o.py
index b545e177c7..1eb8d78b8b 100644
--- a/pyscf/scf/test/test_h2o.py
+++ b/pyscf/scf/test/test_h2o.py
@@ -19,7 +19,6 @@
 import unittest
 import numpy
 import scipy.linalg
-import tempfile
 from pyscf import lib
 from pyscf import gto
 from pyscf import scf
@@ -196,7 +195,7 @@ def test_init_guess_minao(self):
         self.assertEqual(dm.mo_occ.size, dm.mo_coeff.shape[1])
         s = scf.hf.get_ovlp(mol)
         occ, mo = scipy.linalg.eigh(dm, s, type=2)
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         scf.chkfile.dump_scf(mol, ftmp.name, 0, occ, mo, occ)
         self.assertAlmostEqual(numpy.linalg.norm(dm), 3.0334714065913508, 9)
 
@@ -220,7 +219,7 @@ def test_init_guess_atom(self):
         self.assertEqual(dm.mo_occ.size, dm.mo_coeff.shape[1])
         s = scf.hf.get_ovlp(mol)
         occ, mo = scipy.linalg.eigh(dm, s, type=2)
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         scf.chkfile.dump_scf(mol, ftmp.name, 0, occ, mo, occ)
         self.assertAlmostEqual(numpy.linalg.norm(dm), 3.041411845876416, 8)
 
@@ -249,7 +248,7 @@ def test_init_guess_1e(self):
         self.assertEqual(dm.mo_occ.size, dm.mo_coeff.shape[1])
         s = scf.hf.get_ovlp(mol)
         occ, mo = scipy.linalg.eigh(dm, s, type=2)
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         scf.chkfile.dump_scf(mol, ftmp.name, 0, occ, mo, occ,
                              overwrite_mol=False)  # dump_scf twice to test overwrite_mol
         scf.chkfile.dump_scf(mol, ftmp.name, 0, occ, mo, occ)
@@ -275,7 +274,7 @@ def test_init_guess_1e(self):
         self.assertAlmostEqual(numpy.linalg.norm(dm1), 7.5925205205065422, 9)
 
     def test_init_guess_chkfile(self):
-        ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         def save(HFclass):
             mf0 = HFclass(mol)
             mf0.chkfile = ftmp.name
diff --git a/pyscf/scf/test/test_rhf.py b/pyscf/scf/test/test_rhf.py
index 2334cdd248..c48acd8e05 100644
--- a/pyscf/scf/test/test_rhf.py
+++ b/pyscf/scf/test/test_rhf.py
@@ -18,22 +18,16 @@
 
 import numpy
 import unittest
-import tempfile
 from pyscf import lib
 from pyscf import gto
 from pyscf import scf
+from pyscf.scf import _vhf
 from pyscf.scf import atom_hf
 
-import sys
 try:
-    import dftd3
-except ImportError:
-    pass
-
-try:
-    import dftd4
-except ImportError:
-    pass
+    from pyscf.dispersion import dftd3, dftd4
+except (ImportError, OSError):
+    dftd3 = dftd4 = None
 
 def setUpModule():
     global mol, mf, n2sym, n2mf, re_ecp1, re_ecp2
@@ -49,7 +43,7 @@ def setUpModule():
 
     mf = scf.RHF(mol)
     mf.conv_tol = 1e-10
-    mf.chkfile = tempfile.NamedTemporaryFile().name
+    mf.chkfile = lib.NamedTemporaryFile().name
     mf.kernel()
 
     n2sym = gto.M(
@@ -229,7 +223,7 @@ def test_atom_hf_with_ecp(self):
         self.assertAlmostEqual(scf_result['Cu'][0], -194.92388639203045, 9)
 
     def test_init_guess_chk(self):
-        dm = mol.HF(chkfile=tempfile.NamedTemporaryFile().name).get_init_guess(mol, key='chkfile')
+        dm = mol.HF(chkfile=lib.NamedTemporaryFile().name).get_init_guess(mol, key='chkfile')
         self.assertAlmostEqual(lib.fp(dm), 2.5912875957299684, 5)
 
         dm = mf.get_init_guess(mol, key='chkfile')
@@ -253,6 +247,12 @@ def test_init_guess_huckel(self):
         dm = scf.hf.RHF(mol).get_init_guess(mol, key='mod_huckel')
         self.assertAlmostEqual(lib.fp(dm), 3.233072986208057, 5)
 
+        # init_guess_by_mod_huckel should be callable without arguments,
+        # consistent with init_guess_by_huckel and the UHF/ROHF/GHF/DHF
+        # implementations.
+        dm = scf.hf.RHF(mol).init_guess_by_mod_huckel()
+        self.assertAlmostEqual(lib.fp(dm), 3.233072986208057, 5)
+
         dm = scf.ROHF(mol).init_guess_by_mod_huckel()
         self.assertAlmostEqual(lib.fp(dm[0]), 3.233072986208057/2, 5)
 
@@ -371,7 +371,7 @@ def test_analyze(self):
     def test_scf(self):
         self.assertAlmostEqual(mf.e_tot, -76.026765673119627, 9)
 
-    @unittest.skipIf('dispersion' not in sys.modules, "requires the dftd3 library")
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
     def test_scf_d3(self):
         mf = scf.RHF(mol)
         mf.disp = 'd3bj'
@@ -380,7 +380,7 @@ def test_scf_d3(self):
         e_tot = mf.kernel()
         self.assertAlmostEqual(e_tot, -76.03127458778653, 9)
 
-    @unittest.skipIf('dispersion' not in sys.modules, "requires the dftd4 library")
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
     def test_scf_d4(self):
         mf = scf.RHF(mol)
         mf.disp = 'd4'
@@ -763,12 +763,12 @@ def test_as_scanner(self):
         self.assertAlmostEqual(mf_scanner(mol.atom), -76.075408156235909, 9)
 
         mol1 = gto.M(atom='H 0 0 0; H 0 0 .9', basis='cc-pvdz')
-        ref = mol1.RHF(chkfile=tempfile.NamedTemporaryFile().name).x2c().density_fit().run()
+        ref = mol1.RHF(chkfile=lib.NamedTemporaryFile().name).x2c().density_fit().run()
         e1 = mf_scanner('H 0 0 0; H 0 0 .9')
         self.assertAlmostEqual(e1, -1.116394048204042, 9)
         self.assertAlmostEqual(e1, ref.e_tot, 9)
 
-        mfs = mol1.RHF(chkfile=tempfile.NamedTemporaryFile().name).as_scanner()
+        mfs = mol1.RHF(chkfile=lib.NamedTemporaryFile().name).as_scanner()
         mfs.__dict__.update(scf.chkfile.load(ref.chkfile, 'scf'))
         e = mfs(mol1)
         self.assertAlmostEqual(e, -1.1163913004438035, 9)
@@ -901,10 +901,10 @@ def test_get_vj(self):
         self.assertAlmostEqual(numpy.linalg.norm(vj1), 77.035779188661465, 9)
 
         orig = mf1.opt.prescreen
-        self.assertEqual(orig, scf._vhf._fpointer('CVHFnrs8_prescreen').value)
+        self.assertEqual(orig, _vhf._fpointer('CVHFnrs8_prescreen').value)
         mf1.opt.prescreen = orig
         mf1.opt.prescreen = 'CVHFnoscreen'
-        self.assertEqual(mf1.opt.prescreen, scf._vhf._fpointer('CVHFnoscreen').value)
+        self.assertEqual(mf1.opt.prescreen, _vhf._fpointer('CVHFnoscreen').value)
 
         # issue #1114
         dm = numpy.eye(nao, dtype=int)
diff --git a/pyscf/scf/test/test_uhf.py b/pyscf/scf/test/test_uhf.py
index e94f0d52e5..e0163e47ff 100644
--- a/pyscf/scf/test/test_uhf.py
+++ b/pyscf/scf/test/test_uhf.py
@@ -23,6 +23,25 @@
 from pyscf import gto
 from pyscf import scf
 
+try:
+    from pyscf.dispersion import dftd3, dftd4
+except (ImportError, OSError):
+    dftd3 = dftd4 = None
+
+
+def make_disp_mol():
+    return gto.M(
+        atom='''
+C  -0.65830719,  0.61123287, -0.00800148
+C   0.73685281,  0.61123287, -0.00800148
+C   1.43439081,  1.81898387, -0.00800148
+C   0.73673681,  3.02749287, -0.00920048
+''',
+        basis='ccpvtz',
+        charge=1,
+        spin=1,
+        output='/dev/null')
+
 def setUpModule():
     global mol, mf, n2sym, n2mf, mol2, mf2, bak
     mol = gto.M(
@@ -120,6 +139,44 @@ def test_init_guess_sap(self):
         dm2 = scf.uhf.UHF(mol).get_init_guess(mol, key='sap')
         self.assertAlmostEqual(lib.fp(dm2), 0.6440359527450615, 7)
 
+    def test_break_spin_symm_mix(self):
+        # H2 at equilibrium: verify DM properties of the breaksym='mix' initial guess
+        mol_h2 = gto.M(atom='H 0 0 0; H 0 0 1.4', basis='sto-3g', spin=0, verbose=0)
+        s = mol_h2.intor_symmetric('int1e_ovlp')
+
+        mf_h2 = scf.UHF(mol_h2)
+        dm = mf_h2.init_guess_by_minao(mol_h2, breaksym='mix')
+        dma, dmb = dm
+
+        # spin symmetry must be broken
+        self.assertFalse(numpy.allclose(dma, dmb))
+
+        # electron count preserved: Tr(S * DM) = N_elec per spin
+        self.assertAlmostEqual(numpy.einsum('ij,ji->', s, dma), 1.0, 5)
+        self.assertAlmostEqual(numpy.einsum('ij,ji->', s, dmb), 1.0, 5)
+
+        # DMs must be positive semidefinite (physically valid density matrices)
+        self.assertTrue(numpy.all(numpy.linalg.eigvalsh(dma) > -1e-10))
+        self.assertTrue(numpy.all(numpy.linalg.eigvalsh(dmb) > -1e-10))
+
+    def test_break_spin_symm_mix_h2_dissociation(self):
+        # At stretched H2 (well past the Coulson-Fischer point) UHF with
+        # breaksym='mix' should find a lower-energy broken-symmetry solution
+        # than RHF, with the unpaired electrons localised on separate atoms.
+        mol_h2 = gto.M(atom='H 0 0 0; H 0 0 4.0', basis='sto-3g', spin=0, verbose=0)
+
+        e_rhf = scf.RHF(mol_h2).kernel()
+
+        mf_uhf = scf.UHF(mol_h2)
+        mf_uhf.init_guess_breaksym = 'mix'
+        e_uhf = mf_uhf.kernel()
+
+        self.assertTrue(mf_uhf.converged)
+        # broken-symmetry UHF must be lower than RHF at stretched geometry
+        self.assertLess(e_uhf, e_rhf)
+        # significant spin contamination expected (<S^2> -> 1 as R -> inf)
+        self.assertGreater(mf_uhf.spin_square()[0], 0.5)
+
     def test_get_grad(self):
         g = mf2.get_grad(mf2.mo_coeff, mf2.mo_occ)
         self.assertAlmostEqual(abs(g).max(), 0, 6)
@@ -152,6 +209,24 @@ def test_mulliken_spin_pop(self):
     def test_scf(self):
         self.assertAlmostEqual(mf.e_tot, -76.026765673119627, 9)
 
+    @unittest.skipIf(dftd3 is None, "requires the dftd3 library")
+    def test_uhf_d3bj(self):
+        mol = make_disp_mol()
+        mf = scf.UHF(mol)
+        mf.disp = 'd3bj'
+        e_disp = mf.get_dispersion()
+        print(e_disp)
+        self.assertAlmostEqual(e_disp, -0.030566786972, 9)
+
+    @unittest.skipIf(dftd4 is None, "requires the dftd4 library")
+    def test_uhf_d4(self):
+        mol = make_disp_mol()
+        mf = scf.UHF(mol)
+        mf.disp = 'd4'
+        e_disp = mf.get_dispersion()
+        print(e_disp)
+        self.assertAlmostEqual(e_disp, -0.0096708308236, 9)
+
     def test_scf_negative_spin(self):
         mol = gto.M(atom = '''
         O     0    0        0
diff --git a/pyscf/scf/uhf.py b/pyscf/scf/uhf.py
index fa71c9b1e9..7190be0be1 100644
--- a/pyscf/scf/uhf.py
+++ b/pyscf/scf/uhf.py
@@ -122,6 +122,57 @@ def _break_dm_spin_symm(mol, dm, breaksym=1):
             dmb = numpy.zeros_like(dma)
             for b0, b1, p0, p1 in mol.aoslice_by_atom():
                 dmb[...,p0:p1,p0:p1] = dma[...,p0:p1,p0:p1]
+        elif breaksym == 'mix':
+            # 45-degree HOMO-LUMO rotation mixes the frontier orbitals between alpha
+            # and beta spins while keeping them delocalized over the full molecule.
+            # Unlike breaksym=1 (which zeroes off-diagonal AO blocks and artificially
+            # localises charge onto atoms), this rotation preserves the full molecular
+            # orbital character and provides a smoother path away from the RHF fixed
+            # point, reducing the risk of converging back to the closed-shell solution.
+            mo_coeff = getattr(dm, 'mo_coeff', None)
+            mo_occ = getattr(dm, 'mo_occ', None)
+            if mo_coeff is not None and mo_occ is not None:
+                # Energy-ordered MOs (ascending): HOMO = last occupied, LUMO = first virtual.
+                mo_a = mo_coeff[0]
+                occ_a = mo_occ[0]
+                occ_b = mo_occ[1]
+                occ_idx = numpy.where(occ_a > 0.5)[0]
+                vir_idx = numpy.where(occ_a < 0.5)[0]
+                homo_idx = occ_idx[-1]
+                lumo_idx = vir_idx[0]
+            else:
+                # No MO info: build a MINAO restricted DM, construct the Fock matrix
+                # from it, and diagonalise once to get energy-ordered MOs.  This costs
+                # one Fock build and one diagonalisation (no SCF iterations) but gives
+                # the true HOMO/LUMO rather than an arbitrary vector from the degenerate
+                # virtual subspace that a plain DM diagonalisation would produce.
+                rhf_tmp = hf.RHF(mol)
+                dm_minao = hf.init_guess_by_minao(mol)
+                fock = rhf_tmp.get_hcore() + rhf_tmp.get_veff(mol, dm_minao)
+                s1e = mol.intor_symmetric('int1e_ovlp')
+                _, mo_a = rhf_tmp.eig(fock, s1e)
+                mo_occ_rhf = rhf_tmp.get_occ(_, mo_a)
+                occ_a = (mo_occ_rhf > 1e-8).astype(numpy.double)
+                occ_b = occ_a.copy()
+                occ_idx = numpy.where(occ_a > 0.5)[0]
+                vir_idx = numpy.where(occ_a < 0.5)[0]
+                homo_idx = occ_idx[-1]
+                lumo_idx = vir_idx[0]
+            if len(occ_idx) > 0 and len(vir_idx) > 0:
+                homo = mo_a[:, homo_idx]
+                lumo = mo_a[:, lumo_idx]
+                c = numpy.sqrt(0.5)
+                # alpha HOMO -> (HOMO + LUMO)/sqrt(2)
+                mo_alpha = mo_a.copy()
+                mo_alpha[:, homo_idx] = c * (homo + lumo)
+                # beta  HOMO -> (HOMO - LUMO)/sqrt(2)
+                mo_beta = mo_a.copy()
+                mo_beta[:, homo_idx] = c * (homo - lumo)
+                dma = numpy.dot(mo_alpha[:, occ_a > 0.5] * occ_a[occ_a > 0.5],
+                                mo_alpha[:, occ_a > 0.5].conj().T)
+                dmb = numpy.dot(mo_beta[:, occ_b > 0.5] * occ_b[occ_b > 0.5],
+                                mo_beta[:, occ_b > 0.5].conj().T)
+
         else:
             # Adjust num. electrons for density matrices (issue #1839)
             # Get overlap matrix
@@ -760,11 +811,15 @@ class UHF(hf.SCF):
             If given, freeze the number of (alpha,beta) electrons to the given value.
         level_shift : number or two-element list
             level shift (in Eh) for alpha and beta Fock if two-element list is given.
-        init_guess_breaksym : int
-             This configuration controls the algorithm used to break the spin
-             symmetry of the initial guess:
-             - 0 to disable symmetry breaking in the initial guess.
-             - 1 to use the default algorithm introduced in pyscf-1.7.
+        init_guess_breaksym : int or str
+             Controls how spin symmetry is broken in the initial guess:
+             - 0 to disable symmetry breaking.
+             - 1 (default) to use the atom-block algorithm introduced in pyscf-1.7.
+             - 'mix' to rotate the HOMO and LUMO by 45 degrees between alpha and
+               beta spins. Builds one MINAO Fock matrix and diagonalises it to get
+               energy-ordered MOs, then mixes: alpha HOMO -> (HOMO+LUMO)/sqrt(2),
+               beta HOMO -> (HOMO-LUMO)/sqrt(2). Preserves molecular delocalization and
+               gives a smoother symmetry break than mode 1.
              - 2 to adjust the num. electrons for spin-up and spin-down density matrices (issue #1839).
 
     Examples:
diff --git a/pyscf/solvent/grad/pcm.py b/pyscf/solvent/grad/pcm.py
index 1e93c38b11..7e81cbe954 100644
--- a/pyscf/solvent/grad/pcm.py
+++ b/pyscf/solvent/grad/pcm.py
@@ -220,6 +220,8 @@ def grad_qv(pcmobj, dm, q_sym = None):
         fakemol.cart = mol.cart
         v_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip1, aosym='s1', cintopt=cintopt)
         dvj += numpy.einsum('xijk,ij,k->xi', v_nj, dm, q_sym[p0:p1])
+        # Free up v_nj to stay within mem limits
+        del v_nj
 
     int3c2e_ip2 = mol._add_suffix('int3c2e_ip2')
     cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas, mol._env, int3c2e_ip2)
@@ -229,6 +231,8 @@ def grad_qv(pcmobj, dm, q_sym = None):
         fakemol.cart = mol.cart
         q_nj = df.incore.aux_e2(mol, fakemol, intor=int3c2e_ip2, aosym='s1', cintopt=cintopt)
         dq[:,p0:p1] = numpy.einsum('xijk,ij,k->xk', q_nj, dm, q_sym[p0:p1])
+        # Free up q_nj to stay within mem limits
+        del q_nj
 
     aoslice = mol.aoslice_by_atom()
     dq = numpy.asarray([numpy.sum(dq[:,p0:p1], axis=1) for p0,p1 in gridslice])
@@ -415,7 +419,7 @@ def make_grad_object(base_method):
     assert isinstance(base_method, _Solvation)
     with_solvent = base_method.with_solvent
     if with_solvent.frozen:
-        raise RuntimeError('Frozen solvent model is not avialbe for energy gradients')
+        raise RuntimeError('Frozen solvent model is not available for energy gradients')
 
     # create the Gradients in vacuum. Cannot call super().Gradients() here
     # because other dynamic corrections might be applied to the base_method.
diff --git a/pyscf/solvent/pol_embed.py b/pyscf/solvent/pol_embed.py
index b6b48e0c9a..17cb71c096 100644
--- a/pyscf/solvent/pol_embed.py
+++ b/pyscf/solvent/pol_embed.py
@@ -496,7 +496,7 @@ def nuc_grad_method(self, grad_method):
            1        0.000000   -0.935307   -1.082500
                 ''', basis='sto3g')
     mf = mol.RHF()
-    with tempfile.NamedTemporaryFile() as f:
+    with lib.NamedTemporaryFile() as f:
         f.write(b'''!
 @COORDINATES
 3
diff --git a/pyscf/solvent/test/test_pol_embed.py b/pyscf/solvent/test/test_pol_embed.py
index 2f6289beec..ec45acd259 100644
--- a/pyscf/solvent/test/test_pol_embed.py
+++ b/pyscf/solvent/test/test_pol_embed.py
@@ -15,7 +15,6 @@
 
 import unittest
 import os
-import tempfile
 import numpy
 from numpy.testing import assert_allclose
 from pyscf import lib, gto, scf, dft
@@ -34,7 +33,7 @@
 
 def setUpModule():
     global potf, potf2, mol, mol2, potfile, potfile2
-    potf = tempfile.NamedTemporaryFile()
+    potf = lib.NamedTemporaryFile()
     potf.write(b'''!
 @COORDINATES
 3
@@ -70,7 +69,7 @@ def setUpModule():
                 ''', basis='sto3g', verbose=7,
                 output='/dev/null')
 
-    potf2 = tempfile.NamedTemporaryFile()
+    potf2 = lib.NamedTemporaryFile()
     potf2.write(b'''! water molecule + a large, positive charge to force electron spill-out
 @COORDINATES
 4
@@ -294,7 +293,7 @@ def test_pe_scf_ecp(self):
         self.assertAlmostEqual(mf.e_tot, -168.147494986446, 8)
 
     def test_as_scanner(self):
-        mf = mol.RHF(chkfile=tempfile.NamedTemporaryFile().name)
+        mf = mol.RHF(chkfile=lib.NamedTemporaryFile().name)
         mf_scanner = solvent.PE(mf, potfile).as_scanner()
         mf_scanner(mol)
         self.assertAlmostEqual(mf_scanner.with_solvent.e, 0.00020182314249546455, 9)
diff --git a/pyscf/tdscf/test/test_tddks.py b/pyscf/tdscf/test/test_tddks.py
index 742671cea7..084887cb87 100644
--- a/pyscf/tdscf/test/test_tddks.py
+++ b/pyscf/tdscf/test/test_tddks.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import numpy
 from pyscf import lib, gto, scf, dft
 from pyscf import tdscf
@@ -38,8 +37,7 @@ def setUpModule():
     mol.basis = 'uncsto3g'
     mol.spin = 1
     mol.build()
-    mf_lda = mol.DKS().set(xc='lda,', conv_tol=1e-12,
-                           chkfile=tempfile.NamedTemporaryFile().name).run()
+    mf_lda = mol.DKS().set(xc='lda,', conv_tol=1e-12, chkfile=lib.NamedTemporaryFile().name).run()
 
 def tearDownModule():
     global mol, mf_lda
diff --git a/pyscf/tdscf/test/test_tdgks.py b/pyscf/tdscf/test/test_tdgks.py
index 30ee8a92bc..4411664f19 100644
--- a/pyscf/tdscf/test/test_tdgks.py
+++ b/pyscf/tdscf/test/test_tdgks.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import numpy
 from pyscf import lib, gto, scf, dft
 from pyscf import tdscf
@@ -48,16 +47,14 @@ def setUpModule():
     mol.spin = 1
     mol.build()
 
-    mf_lda = mol.GKS().set(xc='lda,', conv_tol=1e-12,
-                           chkfile=tempfile.NamedTemporaryFile().name).newton().run()
+    mf_lda = mol.GKS().set(xc='lda,', conv_tol=1e-12, chkfile=lib.NamedTemporaryFile().name).newton().run()
     mcol_lda = None
     if mcfun is not None:
-        mcol_lda = mol.GKS().set(xc='lda,', conv_tol=1e-12,
-                                 collinear='mcol', chkfile=tempfile.NamedTemporaryFile().name)
+        mcol_lda = mol.GKS().set(xc='lda,', conv_tol=1e-12, chkfile=lib.NamedTemporaryFile().name,
+                                  collinear='mcol')
         mcol_lda._numint.spin_samples = 6
         mcol_lda = mcol_lda.run()
-    mf_bp86 = molsym.GKS().set(xc='bp86', conv_tol=1e-12,
-                               chkfile=tempfile.NamedTemporaryFile().name).run()
+    mf_bp86 = molsym.GKS().set(xc='bp86', conv_tol=1e-12, chkfile=lib.NamedTemporaryFile().name).run()
 
 def tearDownModule():
     global mol, molsym, mf_bp86, mf_lda, mcol_lda
diff --git a/pyscf/tdscf/test/test_tdrks_vv10.py b/pyscf/tdscf/test/test_tdrks_vv10.py
index bcb3d779bd..efae7c7c81 100644
--- a/pyscf/tdscf/test/test_tdrks_vv10.py
+++ b/pyscf/tdscf/test/test_tdrks_vv10.py
@@ -117,9 +117,9 @@ def test_wb97xv_tda(self):
         mf = make_mf(mol)
         tda = mf.TDA()
         tda.exclude_nlc = False
-        test_excitation_energy, test_state_vector = tda.kernel(nstates = len(reference_excited_state_energy))
+        test_excitation_energy, test_state_vector = tda.kernel(nstates = 2)
 
-        assert np.linalg.norm(test_excitation_energy - reference_excitation_energy) < excitation_energy_threshold
+        assert np.linalg.norm(test_excitation_energy - reference_excitation_energy[:2]) < excitation_energy_threshold
 
         reference_transition_dipole = np.array([
             [-0.0039, -0.0088, -0.0068],
@@ -130,14 +130,14 @@ def test_wb97xv_tda(self):
         ])
         test_transition_dipole = tda.transition_dipole()
 
-        for i_dipole in range(reference_transition_dipole.shape[0]):
+        for i_dipole in range(2):
             assert np.linalg.norm(test_transition_dipole[i_dipole] - reference_transition_dipole[i_dipole]) < dipole_threshold \
                 or np.linalg.norm(test_transition_dipole[i_dipole] + reference_transition_dipole[i_dipole]) < dipole_threshold
 
         reference_oscillator_strength = np.array([0.0000204074, 0.0054841178, 0.0031204297, 0.0063755735, 0.0137712931])
         test_oscillator_strength = tda.oscillator_strength()
 
-        assert np.linalg.norm(test_oscillator_strength - reference_oscillator_strength) < oscillator_strength_threshold
+        assert np.linalg.norm(test_oscillator_strength - reference_oscillator_strength[:2]) < oscillator_strength_threshold
 
     def test_wb97xv_tddft_triplet_high_cost(self):
         ### Q-Chem input
@@ -232,7 +232,7 @@ def test_wb97xv_unrestricted_tddft_high_cost(self):
 
         assert np.linalg.norm(test_oscillator_strength - reference_oscillator_strength) < oscillator_strength_threshold
 
-    def test_wb97xv_unrestricted_tda(self):
+    def test_wb97xv_unrestricted_tda_high_cost(self):
         # Same Q-Chem input as above, Q-Chem computes both TDA and TDDFT in the same run
         reference_ground_state_energy = -150.9397884760
         reference_excited_state_energy = np.array([-150.88981193, -150.79604327, -150.75118183, -150.72292823, -150.71461300])
diff --git a/pyscf/tdscf/test/test_tduks.py b/pyscf/tdscf/test/test_tduks.py
index f752c2e4fe..7f45ced23f 100644
--- a/pyscf/tdscf/test/test_tduks.py
+++ b/pyscf/tdscf/test/test_tduks.py
@@ -146,8 +146,8 @@ def test_tddft_camb3lyp(self):
         es = td.kernel(nstates=4)[0]
         a,b = td.get_ab()
         e_ref = diagonalize(a, b, 5)
-        self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 6)
-        self.assertAlmostEqual(lib.fp(es[:3]*27.2114), 7.69383202636, 4)
+        self.assertAlmostEqual(abs(es[:3]-e_ref[:3]).max(), 0, 5)
+        self.assertAlmostEqual(lib.fp(es[:3]*27.2114) - 7.69383202636, 0, 4)
 
     def test_tda_b3lyp(self):
         td = tdscf.TDA(mf_b3lyp)
diff --git a/pyscf/tools/c60struct.py b/pyscf/tools/c60struct.py
index f9bbbb587c..975bd78abb 100644
--- a/pyscf/tools/c60struct.py
+++ b/pyscf/tools/c60struct.py
@@ -15,12 +15,15 @@
 from functools import reduce
 import numpy
 
+
 def rotmatz(ang):
     c = numpy.cos(ang)
     s = numpy.sin(ang)
     return numpy.array((( c, s, 0),
                         (-s, c, 0),
                         ( 0, 0, 1),))
+
+
 def rotmaty(ang):
     c = numpy.cos(ang)
     s = numpy.sin(ang)
@@ -28,11 +31,11 @@ def rotmaty(ang):
                         ( 0, 1, 0),
                         (-s, 0, c),))
 
+
 def r2edge(ang, r):
     return 2*r*numpy.sin(ang/2)
 
 
-
 def make60(b5, b6):
     theta1 = numpy.arccos(1/numpy.sqrt(5))
     theta2 = (numpy.pi - theta1) * .5
diff --git a/pyscf/tools/chgcar.py b/pyscf/tools/chgcar.py
index 315f846732..2be1f9ae9d 100644
--- a/pyscf/tools/chgcar.py
+++ b/pyscf/tools/chgcar.py
@@ -166,7 +166,7 @@ def __init__(self, cell, nx=60, ny=60, nz=60, resolution=RESOLUTION,
             self.mol = cell
             cell = cell.view(pbcgto.Cell)
             if (isinstance(cell.unit, str) and
-                cell.unit.startswith(('B','b','au','AU'))):
+                    cell.unit.startswith(('B','b','au','AU'))):
                 cell.a = self.box
             else:
                 cell.a = self.box * lib.param.BOHR
@@ -183,7 +183,7 @@ def __init__(self, cell, nx=60, ny=60, nz=60, resolution=RESOLUTION,
         self.boxorig = numpy.zeros(3)
         self.vol = cell.vol
 
-    def get_coords(self) :
+    def get_coords(self):
         """  Result: set of coordinates to compute a field which is to be stored
         in the file.
         """
@@ -208,9 +208,9 @@ def write(self, field, fname, comment=None):
         field = field * self.vol
 
         boxA = self.box * lib.param.BOHR
-        atomList= [cell.atom_pure_symbol(i) for i in range(cell.natm)]
+        atomList = [cell.atom_pure_symbol(i) for i in range(cell.natm)]
         Axyz = zip(atomList, cell.atom_coords().tolist())
-        Axyz = sorted(Axyz, key = lambda x: x[0])
+        Axyz = sorted(Axyz, key=lambda x: x[0])
         swappedCoords = [(vec[1]+self.boxorig) * lib.param.BOHR for vec in Axyz]
         vaspAtomicInfo = collections.Counter([xyz[0] for xyz in Axyz])
         vaspAtomicInfo = sorted(vaspAtomicInfo.items())
@@ -221,8 +221,8 @@ def write(self, field, fname, comment=None):
             f.write('%14.8f %14.8f %14.8f \n' % (boxA[0,0],boxA[0,1],boxA[0,2]))
             f.write('%14.8f %14.8f %14.8f \n' % (boxA[1,0],boxA[1,1],boxA[1,2]))
             f.write('%14.8f %14.8f %14.8f \n' % (boxA[2,0],boxA[2,1],boxA[2,2]))
-            f.write(''.join(['%5.3s'%atomN[0] for atomN in vaspAtomicInfo]) + '\n')
-            f.write(''.join(['%5d'%atomN[1] for atomN in vaspAtomicInfo]) + '\n')
+            f.write(''.join(['%5.3s' % atomN[0] for atomN in vaspAtomicInfo]) + '\n')
+            f.write(''.join(['%5d' % atomN[1] for atomN in vaspAtomicInfo]) + '\n')
             f.write('Cartesian \n')
             for ia in range(cell.natm):
                 f.write(' %14.8f %14.8f %14.8f\n' % tuple(swappedCoords[ia]))
@@ -244,6 +244,6 @@ def read(self, chgcar_file):
     from pyscf.tools import chgcar
     cell = gto.M(atom='H 0 0 0; H 0 0 1', a=numpy.eye(3)*3)
     mf = scf.RHF(cell).run()
-    chgcar.density(cell, 'h2.CHGCAR', mf.make_rdm1()) #makes total density
-    chgcar.orbital(cell, 'h2_mo1.CHGCAR', mf.mo_coeff[:,0]) # makes mo#1 (sigma)
-    chgcar.orbital(cell, 'h2_mo2.CHGCAR', mf.mo_coeff[:,1]) # makes mo#2 (sigma*)
+    chgcar.density(cell, 'h2.CHGCAR', mf.make_rdm1())  # makes total density
+    chgcar.orbital(cell, 'h2_mo1.CHGCAR', mf.mo_coeff[:,0])  # makes mo#1 (sigma)
+    chgcar.orbital(cell, 'h2_mo2.CHGCAR', mf.mo_coeff[:,1])  # makes mo#2 (sigma*)
diff --git a/pyscf/tools/cubegen.py b/pyscf/tools/cubegen.py
index 52f769abe8..d4d972965a 100644
--- a/pyscf/tools/cubegen.py
+++ b/pyscf/tools/cubegen.py
@@ -274,12 +274,12 @@ def __init__(self, mol, nx=80, ny=80, nz=80, resolution=RESOLUTION,
             self.ys = numpy.linspace(0, 1, ny, endpoint=True)
             self.zs = numpy.linspace(0, 1, nz, endpoint=True)
 
-    def get_coords(self) :
+    def get_coords(self):
         """  Result: set of coordinates to compute a field which is to be stored
         in the file.
         """
         frac_coords = lib.cartesian_prod([self.xs, self.ys, self.zs])
-        return frac_coords @ self.box + self.boxorig # Convert fractional coordinates to real-space coordinates
+        return frac_coords @ self.box + self.boxorig  # Convert fractional coordinates to real-space coordinates
 
     def get_ngrids(self):
         return self.nx * self.ny * self.nz
@@ -310,7 +310,7 @@ def write(self, field, fname, comment=None):
             f.write(f'{self.nz:5d}{delta[2,0]:12.6f}{delta[2,1]:12.6f}{delta[2,2]:12.6f}\n')
             for ia in range(mol.natm):
                 atmsymb = mol.atom_symbol(ia)
-                f.write('%5d%12.6f'% (gto.charge(atmsymb), 0.))
+                f.write('%5d%12.6f' % (gto.charge(atmsymb), 0.))
                 f.write('%12.6f%12.6f%12.6f\n' % tuple(coord[ia]))
 
             for ix in range(self.nx):
@@ -326,6 +326,7 @@ def read(self, cube_file):
             data = f.readline().split()
             natm = int(data[0])
             self.boxorig = numpy.array([float(x) for x in data[1:]])
+
             def parse_nx(data):
                 from pyscf.pbc.gto import Cell
                 d = data.split()
@@ -361,6 +362,6 @@ def parse_nx(data):
                 H 0.761561, 0.478993, 0.00000000
                 H -0.761561, 0.478993, 0.00000000''', basis='6-31g*')
     mf = scf.RHF(mol).run()
-    cubegen.density(mol, 'h2o_den.cube', mf.make_rdm1()) #makes total density
+    cubegen.density(mol, 'h2o_den.cube', mf.make_rdm1())  # makes total density
     cubegen.mep(mol, 'h2o_pot.cube', mf.make_rdm1())
     cubegen.orbital(mol, 'h2o_mo1.cube', mf.mo_coeff[:,0])
diff --git a/pyscf/tools/molden.py b/pyscf/tools/molden.py
index 07862101be..31da31bca4 100644
--- a/pyscf/tools/molden.py
+++ b/pyscf/tools/molden.py
@@ -76,6 +76,7 @@ def orbital_coeff(mol, fout, mo_coeff, spin='Alpha', symm=None, ene=None,
         for i,j in enumerate(aoidx):
             fout.write(' %3d    %18.14g\n' % (i+1, mo_coeff[j,imo]))
 
+
 def from_mo(mol, filename, mo_coeff, spin='Alpha', symm=None, ene=None,
             occ=None, ignore_h=IGNORE_H):
     '''Dump the given MOs in Molden format'''
@@ -87,6 +88,8 @@ def from_mo(mol, filename, mo_coeff, spin='Alpha', symm=None, ene=None,
 def from_scf(mf, filename, ignore_h=IGNORE_H):
     '''Dump the given SCF object in Molden format'''
     dump_scf(mf, filename, ignore_h)
+
+
 def dump_scf(mf, filename, ignore_h=IGNORE_H):
     import pyscf.scf
     mol = mf.mol
@@ -104,6 +107,7 @@ def dump_scf(mf, filename, ignore_h=IGNORE_H):
             orbital_coeff(mf.mol, f, mf.mo_coeff,
                           ene=mf.mo_energy, occ=mf.mo_occ, ignore_h=ignore_h)
 
+
 def from_mcscf(mc, filename, ignore_h=IGNORE_H, cas_natorb=False):
     mol = mc.mol
     dm1 = mc.make_rdm1()
@@ -118,6 +122,7 @@ def from_mcscf(mc, filename, ignore_h=IGNORE_H, cas_natorb=False):
         header(mol, f, ignore_h)
         orbital_coeff(mol, f, mo_coeff, ene=mo_energy, occ=occ, ignore_h=ignore_h)
 
+
 def from_chkfile(filename, chkfile, key='scf/mo_coeff', ignore_h=IGNORE_H):
     import pyscf.scf
     with open(filename, 'w') as f:
@@ -156,6 +161,7 @@ def from_chkfile(filename, chkfile, key='scf/mo_coeff', ignore_h=IGNORE_H):
 
 _SEC_RE = re.compile(r'\[[^]]+\]')
 
+
 def _read_one_section(molden_fp):
     sec = [None]
     last_pos = 0
@@ -183,10 +189,12 @@ def _read_one_section(molden_fp):
 
     return sec
 
+
 def _parse_natoms(lines, envs):
     envs['natm'] = natm = int(lines[1])
     return natm
 
+
 def _parse_atoms(lines, envs):
     if 'ANG' in lines[0].upper():
         envs['unit'] = 1
@@ -203,9 +211,10 @@ def _parse_atoms(lines, envs):
         sys.stderr.write('Number of atoms in section ATOMS does not equal to N_ATOMS\n')
     return atoms
 
+
 def _parse_charge(lines, envs):
-    mulliken_charges = [float(_d2e(x)) for x in lines[1:]]
-    return mulliken_charges
+    return [float(_d2e(x)) for x in lines[1:]]  # Mulliken charges
+
 
 def _parse_gto(lines, envs):
     mol = envs['mol']
@@ -251,6 +260,7 @@ def read_one_bas(lsym, nb, fac=1):
     mol._basis = envs['basis'] = gto.format_basis(_basis, sort_basis=False)
     return mol
 
+
 def _parse_mo(lines, envs):
     mol = envs['mol']
     if not mol._built:
@@ -263,7 +273,7 @@ def _parse_mo(lines, envs):
     mo_energy = []
     spins = []
     mo_occ = []
-    mo_coeff_prim = [] # primary data, will be reworked for missing values
+    mo_coeff_prim = []  # primary data, will be reworked for missing values
     coeff_idx = []
     mo_id = 0
     for line in lines[1:]:
@@ -296,7 +306,6 @@ def _parse_mo(lines, envs):
         s = mol.intor('int1e_ovlp')
         mo_coeff = numpy.einsum('i,ij->ij', numpy.sqrt(1/s.diagonal()), mo_coeff)
 
-
     return mol, mo_energy, mo_coeff, mo_occ, irrep_labels, spins
 
 
@@ -316,6 +325,7 @@ def _parse_core(lines, envs):
                          'ECP information was lost when saving to molden format.\n\n')
     return mol.ecp
 
+
 _SEC_PARSER = {'N_ATOMS'  : _parse_natoms,
                'ATOMS'    : _parse_atoms,
                'GTO'      : _parse_gto,
@@ -326,10 +336,11 @@ def _parse_core(lines, envs):
 
 _SEC_ORDER = ['N_ATOMS', 'ATOMS', 'GTO', 'CHARGE', 'MO', 'CORE', 'MOLDEN FORMAT']
 
+
 def load(moldenfile, verbose=0):
     '''Extract mol and orbitals from molden file
     '''
-    sec_kinds = {} # found sections and their lines are stored in this dic
+    sec_kinds = {}  # found sections and their lines are stored in this dic
     with open(moldenfile, 'r') as f:
         mol = gto.Mole()
         mol.cart = True
@@ -367,7 +378,7 @@ def load(moldenfile, verbose=0):
         if sec_kind == 'MO' and 'MO' in sec_kinds:
             if len(sec_kinds['MO']) == 1:
                 mol, mo_energy, mo_coeff, mo_occ, irrep_labels, spins = \
-                        _parse_mo(sec_kinds['MO'][0], tokens)
+                    _parse_mo(sec_kinds['MO'][0], tokens)
                 # If found only one MO section while 'B' appears in the spins
                 # labels, the MOs so obtained are spin orbitals, with beta
                 # orbitals at the second half of the mo_coeff matrix.
@@ -376,23 +387,24 @@ def load(moldenfile, verbose=0):
                         # general spin orbitals which allows to mix spin alpha
                         # and spin beta components in the same orbitals
                         raise NotImplementedError
-                    else:
-                        # Regular spin orbitals, alpha and beta do not mix
-                        beta_idx = numpy.array([s[0] == 'B' for s in spins])
-                        alpha_idx = ~beta_idx
-                        mo_energy = mo_energy[alpha_idx], mo_energy[beta_idx]
-                        mo_coeff = mo_coeff[:,alpha_idx], mo_coeff[:,beta_idx]
-                        mo_occ = mo_occ[alpha_idx], mo_occ[beta_idx]
+
+                    # Regular spin orbitals, alpha and beta do not mix
+                    beta_idx = numpy.array([s[0] == 'B' for s in spins])
+                    alpha_idx = ~beta_idx
+                    mo_energy = mo_energy[alpha_idx], mo_energy[beta_idx]
+                    mo_coeff = mo_coeff[:,alpha_idx], mo_coeff[:,beta_idx]
+                    mo_occ = mo_occ[alpha_idx], mo_occ[beta_idx]
+                    if irrep_labels:
                         irrep_labels = numpy.array(irrep_labels)
                         irrep_labels = irrep_labels[alpha_idx], irrep_labels[beta_idx]
-                        spins = numpy.array(spins)
-                        spins = spins[alpha_idx], spins[beta_idx]
+                    spins = numpy.array(spins)
+                    spins = spins[alpha_idx], spins[beta_idx]
 
             elif len(sec_kinds['MO']) == 2:
                 res_a = _parse_mo(sec_kinds['MO'][0], tokens)
                 res_b = _parse_mo(sec_kinds['MO'][1], tokens)
                 mo_energy, mo_coeff, mo_occ, irrep_labels, spins = \
-                        list(zip(res_a[1:], res_b[1:]))
+                    list(zip(res_a[1:], res_b[1:]))
                 mol = res_b[0]
 
         if sec_kind in sec_kinds:
@@ -406,11 +418,14 @@ def load(moldenfile, verbose=0):
         mol.build(0, 0)
     return mol, mo_energy, mo_coeff, mo_occ, irrep_labels, spins
 
+
 parse = read = load
 
+
 def _d2e(token):
     return token.replace('D', 'e').replace('d', 'e')
 
+
 def header(mol, fout, ignore_h=IGNORE_H):
     if ignore_h:
         mol = remove_high_l(mol)[0]
@@ -426,7 +441,7 @@ def header(mol, fout, ignore_h=IGNORE_H):
 
     fout.write('[GTO]\n')
     for ia, (sh0, sh1, p0, p1) in enumerate(mol.offset_nr_by_atom()):
-        fout.write('%d 0\n' %(ia+1))
+        fout.write('%d 0\n' % (ia+1))
         for ib in range(sh0, sh1):
             l = mol.bas_angular(ib)
             nprim = mol.bas_nprim(ib)
@@ -452,6 +467,7 @@ def header(mol, fout, ignore_h=IGNORE_H):
                 fout.write('%s : %d\n' % (ia+1, nelec_ecp_core))
     fout.write('\n')
 
+
 def order_ao_index(mol):
     # reorder d,f,g function to
     #  5D: D 0, D+1, D-1, D+2, D-2
@@ -500,6 +516,7 @@ def order_ao_index(mol):
                 off += l * 2 + 1
     return idx
 
+
 def remove_high_l(mol, mo_coeff=None):
     '''Remove high angular momentum (l >= 5) functions before dumping molden file.
     If molden function raised error message ``RuntimeError l=5 is not supported``,
@@ -521,23 +538,22 @@ def remove_high_l(mol, mo_coeff=None):
     pmol.build(0, 0)
     if mo_coeff is None:
         return pmol, None
-    else:
-        p1 = 0
-        idx = []
-        for ib in range(mol.nbas):
-            l = mol.bas_angular(ib)
-            nc = mol.bas_nctr(ib)
-            if mol.cart:
-                nd = (l + 1) * (l + 2) // 2
-            else:
-                nd = l * 2 + 1
-            p0, p1 = p1, p1 + nd * nc
-            if l <= 4:
-                idx.append(range(p0, p1))
 
-        idx = numpy.hstack(idx)
-        return pmol, mo_coeff[idx]
+    p1 = 0
+    idx = []
+    for ib in range(mol.nbas):
+        l = mol.bas_angular(ib)
+        nc = mol.bas_nctr(ib)
+        if mol.cart:
+            nd = (l + 1) * (l + 2) // 2
+        else:
+            nd = l * 2 + 1
+        p0, p1 = p1, p1 + nd * nc
+        if l <= 4:
+            idx.append(range(p0, p1))
 
+    idx = numpy.hstack(idx)
+    return pmol, mo_coeff[idx]
 
 
 if __name__ == '__main__':
@@ -545,7 +561,7 @@ def remove_high_l(mol, mo_coeff=None):
     import tempfile
     mol = gto.Mole()
     mol.verbose = 5
-    mol.output = None#'out_gho'
+    mol.output = None  # 'out_gho'
     mol.atom = [['C', (0.,0.,0.)],
                 ['H', ( 1, 1, 1)],
                 ['H', (-1,-1, 1)],
@@ -561,7 +577,7 @@ def remove_high_l(mol, mo_coeff=None):
     print(order_ao_index(mol))
     orbital_coeff(mol, mol.stdout, m.mo_coeff)
 
-    ftmp = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    ftmp = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
     from_mo(mol, ftmp.name, m.mo_coeff)
 
     print(parse(ftmp.name))
diff --git a/pyscf/tools/qcschema.py b/pyscf/tools/qcschema.py
index 9a264552b5..06fbfbb682 100644
--- a/pyscf/tools/qcschema.py
+++ b/pyscf/tools/qcschema.py
@@ -7,7 +7,8 @@
 import pyscf
 from pyscf.lib.parameters import BOHR
 
-def load_qcschema_json( file_name ):
+
+def load_qcschema_json(file_name):
     '''
     Does: loads qcschema format json into a dictionary
     Input:
@@ -16,12 +17,11 @@ def load_qcschema_json( file_name ):
     Returns: dict in qcschema format
     '''
     # load qcschema output json file
-    data = None
-    with open(file_name,'r') as f:
-        data = json.load(f)
-    return data
+    with open(file_name, 'r') as f:
+        return json.load(f)
+
 
-def load_qcschema_go_final_json( file_name ):
+def load_qcschema_go_final_json(file_name):
     '''
     Does: loads qcschema format geometry optimization json
           and returns only the optimized 'final' geometry
@@ -35,12 +35,11 @@ def load_qcschema_go_final_json( file_name ):
     # load qcschema GO output json file
     # and return last 'trajectory' point's entries
     # (this is the optimized molecule)
-    data = None
     temp = None
-    with open(file_name,'r') as f:
+    with open(file_name, 'r') as f:
         temp = json.load(f)
-    data = temp["trajectory"][-1]
-    return data
+    return temp["trajectory"][-1]
+
 
 def load_qcschema_molecule(qcschema_dict, to_Angstrom=False, xyz=False, mol_select=1, step=0):
     '''
@@ -67,21 +66,21 @@ def load_qcschema_molecule(qcschema_dict, to_Angstrom=False, xyz=False, mol_sele
         xyz=True: output a string in xyz file format
                   i.e. first line is number of atoms.
     '''
-    if(mol_select == 1):
+    if (mol_select == 1):
         syms = np.array(qcschema_dict["molecule"]["symbols"])
         geo = np.array(qcschema_dict["molecule"]["geometry"])
-    elif(mol_select == 2):
+    elif (mol_select == 2):
         syms = np.array(qcschema_dict["initial_molecule"]["symbols"])
         geo = np.array(qcschema_dict["initial_molecule"]["geometry"])
-    elif(mol_select == 3):
+    elif (mol_select == 3):
         syms = np.array(qcschema_dict["final_molecule"]["symbols"])
         geo = np.array(qcschema_dict["final_molecule"]["geometry"])
-    elif(mol_select == 4):
+    elif (mol_select == 4):
         # for geometry or md, can load a specific geometry
         syms = np.array(qcschema_dict["trajectory"][step]["molecule"]["symbols"])
         geo = np.array(qcschema_dict["trajectory"][step]["molecule"]["geometry"])
 
-    if(to_Angstrom):
+    if (to_Angstrom):
         # convert Bohr to Angstrom
         geo = geo*BOHR
 
@@ -91,13 +90,14 @@ def load_qcschema_molecule(qcschema_dict, to_Angstrom=False, xyz=False, mol_sele
     PySCF_atoms = list(zip(syms, geo))
 
     # Return as string or return as xyz-format string (i.e. top is NAtoms,blankline)
-    if(xyz):
+    if (xyz):
         bldstr = f'{NAtoms}\n\n'
         for element, coordinates in PySCF_atoms:
             bldstr += f'{element} {coordinates[0]}, {coordinates[1]}, {coordinates[2]}\n'
             PySCF_atoms = bldstr
     return PySCF_atoms
 
+
 def load_qcschema_hessian(qcschema_dict):
     '''
     Does: loads hessian from qcschema format dictionary
@@ -117,8 +117,8 @@ def load_qcschema_hessian(qcschema_dict):
     NAtom = len(syms)
 
     # reshape from (3N)**2 array to (N,N,3,3)
-    hessian = np.array(qc_h).reshape(NAtom,NAtom,3,3)
-    return hessian
+    return np.array(qc_h).reshape(NAtom, NAtom, 3, 3)
+
 
 def load_qcschema_scf_info(qcschema_dict):
     '''
@@ -133,18 +133,17 @@ def load_qcschema_scf_info(qcschema_dict):
     # Restricted wfn has schema scf_occupations_a occ of 1 or 0.
     # Need to double if rhf/rks/rohf
     method = qcschema_dict["keywords"]["scf"]["method"]
-    if(method == 'rks' or method == 'roks' or method == 'rhf' or method == 'rohf'):
+    if (method == 'rks' or method == 'roks' or method == 'rhf' or method == 'rohf'):
         OccFactor = 2.0
         have_beta = False
-    elif(method == 'uks' or method == 'uhf'):
+    elif (method == 'uks' or method == 'uhf'):
         OccFactor = 1.0
         have_beta = True
-    elif(method == 'gks' or method == 'ghf'):
+    elif (method == 'gks' or method == 'ghf'):
         OccFactor = 1.0
         have_beta = False
     else:
         raise RuntimeError('qcschema: cannot determine method..exit')
-        return
 
     # need to reshape MO coefficients for PySCF shape.
     nao = qcschema_dict["properties"]["calcinfo_nbasis"]
@@ -160,34 +159,32 @@ def load_qcschema_scf_info(qcschema_dict):
 
     # get the 4 things that PySCF wants
     # ...remembering to reshape coeffs and scale occupancies.
-    e_tot = float( qcschema_dict["properties"]["return_energy"] )
+    e_tot = float(qcschema_dict["properties"]["return_energy"])
     mo_coeff = np.reshape(qcschema_dict["wavefunction"]["scf_orbitals_a"],(nao,nmo))
-    mo_occ = np.array( qcschema_dict["wavefunction"]["scf_occupations_a"] )*OccFactor
-    mo_energy = np.array( qcschema_dict["wavefunction"]["scf_eigenvalues_a"] )
-    if(have_beta):
+    mo_occ = np.array(qcschema_dict["wavefunction"]["scf_occupations_a"])*OccFactor
+    mo_energy = np.array(qcschema_dict["wavefunction"]["scf_eigenvalues_a"])
+    if (have_beta):
         # for each useful piece of info we need to combine alpha and beta into 2d array, with alpha first
         # MO occupations
         mo_occ_beta = qcschema_dict["wavefunction"]["scf_occupations_b"]
-        mo_occ = np.vstack( (mo_occ, mo_occ_beta) )
+        mo_occ = np.vstack((mo_occ, mo_occ_beta))
         # MO coefficients
         mo_coeff_beta = np.reshape(qcschema_dict["wavefunction"]["scf_orbitals_b"],(nao,nmo))
-        mo_coeff = np.vstack( (mo_coeff,mo_coeff_beta))
+        mo_coeff = np.vstack((mo_coeff,mo_coeff_beta))
         mo_coeff = np.reshape(mo_coeff,(2,nao,nmo))
         # MO energies
-        mo_energy_beta = np.array( qcschema_dict["wavefunction"]["scf_eigenvalues_b"] )
-        mo_energy = np.vstack( (mo_energy, mo_energy_beta) )
+        mo_energy_beta = np.array(qcschema_dict["wavefunction"]["scf_eigenvalues_b"])
+        mo_energy = np.vstack((mo_energy, mo_energy_beta))
         # etot obviously doesn't need manipulation
 
     # convert to dictionary for PySCF
-    scf_dic = {'e_tot'    : e_tot,
-               'mo_energy': mo_energy,
-               'mo_occ'   : mo_occ,
-               'mo_coeff' : mo_coeff}
-
-    return scf_dic
+    return {'e_tot': e_tot,
+            'mo_energy': mo_energy,
+            'mo_occ': mo_occ,
+            'mo_coeff': mo_coeff}
 
 
-def recreate_mol_obj(qcschema_dict,to_Angstrom=False):
+def recreate_mol_obj(qcschema_dict, to_Angstrom=False):
     '''
     Does: recreates mol object from qcschema format dictionary
     Input:
@@ -197,31 +194,32 @@ def recreate_mol_obj(qcschema_dict,to_Angstrom=False):
     Returns: mol object
     '''
 
-    ## Mol info: ##
-    PySCF_charge = int( qcschema_dict["molecule"]["molecular_charge"] )
+    # ### Mol info: ###
+    PySCF_charge = int(qcschema_dict["molecule"]["molecular_charge"])
     # PySCF 'spin' is number of unpaired electrons, it will be mult-1
-    PySCF_spin = int( qcschema_dict["molecule"]["molecular_multiplicity"] ) - 1
-    PySCF_basis = str( qcschema_dict["model"]["basis"] )
+    PySCF_spin = int(qcschema_dict["molecule"]["molecular_multiplicity"]) - 1
+    PySCF_basis = str(qcschema_dict["model"]["basis"])
 
     # Cartesian/Pure basis
-    PySCF_cart = bool( qcschema_dict["keywords"]["basisSet"]["cartesian"] )
+    PySCF_cart = bool(qcschema_dict["keywords"]["basisSet"]["cartesian"])
 
     # Get molecular structure.
-    PySCF_atoms = load_qcschema_molecule(qcschema_dict, to_Angstrom,False)
+    PySCF_atoms = load_qcschema_molecule(qcschema_dict, to_Angstrom, False)
 
     # Unit Bohr or Angstrom. QCSchema default is Bohr but can change here.
-    if(to_Angstrom):
-        units='A'
+    if (to_Angstrom):
+        units = 'A'
     else:
-        units='B'
+        units = 'B'
 
-    ## Create mol ##
+    # ### Create mol ###
     mol = pyscf.gto.Mole(atom=PySCF_atoms,basis=PySCF_basis,ecp=PySCF_basis,
                          charge=PySCF_charge,spin=PySCF_spin,cart=PySCF_cart,unit=units)
-    mol.build(False,False)
+    mol.build(False, False)
 
     return mol
 
+
 def recreate_scf_obj(qcschema_dict,mol):
     '''
     Does: recreates scf object from qcschema format dictionary
@@ -235,25 +233,24 @@ def recreate_scf_obj(qcschema_dict,mol):
     scf_dict = load_qcschema_scf_info(qcschema_dict)
 
     # create scf object
-    method =  qcschema_dict["keywords"]["scf"]["method"]
-    if(method =='rks'):
+    method = qcschema_dict["keywords"]["scf"]["method"]
+    if (method == 'rks'):
         ks = mol.RKS()
-    elif(method =='uks'):
+    elif (method == 'uks'):
         ks = mol.UKS()
-    elif(method =='rhf'):
+    elif (method == 'rhf'):
         ks = mol.RHF()
-    elif(method =='uhf'):
+    elif (method == 'uhf'):
         ks = mol.UHF()
-    elif(method =='gks'):
+    elif (method == 'gks'):
         ks = mol.GKS()
-    elif(method =='ghf'):
+    elif (method == 'ghf'):
         ks = mol.GHF()
     else:
         raise RuntimeError('qcschema: cannot determine method..exit')
-        return
 
     # get functional
-    if(method == 'rks' or method == 'uks' or method == 'gks'):
+    if (method == 'rks' or method == 'uks' or method == 'gks'):
         functional = qcschema_dict["keywords"]["xcFunctional"]["name"]
         ks.xc = functional
 
diff --git a/pyscf/tools/test/test_cubegen.py b/pyscf/tools/test/test_cubegen.py
index 6a383a7e1f..cc25623f6c 100644
--- a/pyscf/tools/test/test_cubegen.py
+++ b/pyscf/tools/test/test_cubegen.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 from pyscf import lib, gto, scf
 from pyscf.tools import cubegen
 
@@ -35,7 +34,7 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_mep(self):
-        with tempfile.NamedTemporaryFile() as ftmp:
+        with lib.NamedTemporaryFile() as ftmp:
             mep = cubegen.mep(mol, ftmp.name, mf.make_rdm1(),
                               nx=10, ny=10, nz=10)
             self.assertEqual(mep.shape, (10,10,10))
@@ -47,7 +46,7 @@ def test_mep(self):
             self.assertAlmostEqual(lib.fp(mep), -4.653995909548524, 5)
 
     def test_orb(self):
-        with tempfile.NamedTemporaryFile() as ftmp:
+        with lib.NamedTemporaryFile() as ftmp:
             orb = cubegen.orbital(mol, ftmp.name, mf.mo_coeff[:,0],
                                   nx=10, ny=10, nz=10)
             self.assertEqual(orb.shape, (10,10,10))
@@ -65,7 +64,7 @@ def test_orb(self):
 
 
     def test_rho(self):
-        with tempfile.NamedTemporaryFile() as ftmp:
+        with lib.NamedTemporaryFile() as ftmp:
             rho = cubegen.density(mol, ftmp.name, mf.make_rdm1(),
                                   nx=10, ny=10, nz=10)
             self.assertEqual(rho.shape, (10,10,10))
@@ -96,7 +95,7 @@ def test_rho_with_pbc(self):
         cell.output = '/dev/null'
         cell.build()
         mf = cell.RHF().run()
-        with tempfile.NamedTemporaryFile() as ftmp:
+        with lib.NamedTemporaryFile() as ftmp:
             rho = cubegen.density(cell, ftmp.name, mf.make_rdm1(),
                                   nx=10, ny=10, nz=10)
             cc = cubegen.Cube(cell)
diff --git a/pyscf/tools/test/test_fcidump.py b/pyscf/tools/test/test_fcidump.py
index 45ae7df48d..c51de60926 100644
--- a/pyscf/tools/test/test_fcidump.py
+++ b/pyscf/tools/test/test_fcidump.py
@@ -14,13 +14,11 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 from functools import reduce
 import numpy
 from pyscf import lib
 from pyscf import gto, scf, ao2mo
 from pyscf.tools import fcidump
-import tempfile
 
 def setUpModule():
     global mol, mf
@@ -36,7 +34,7 @@ def setUpModule():
     mol.verbose = 0
     mol.build(0, 0)
 
-    mf = mol.RHF(chkfile=tempfile.NamedTemporaryFile().name).run()
+    mf = mol.RHF(chkfile=lib.NamedTemporaryFile().name).run()
 
 def tearDownModule():
     global mol, mf
@@ -44,19 +42,19 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_from_chkfile(self):
-        tmpfcidump = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        tmpfcidump = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         fcidump.from_chkfile(tmpfcidump.name, mf.chkfile, tol=1e-15,
                              molpro_orbsym=True)
 
     def test_from_integral(self):
-        tmpfcidump = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        tmpfcidump = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         h1 = reduce(numpy.dot, (mf.mo_coeff.T, mf.get_hcore(), mf.mo_coeff))
         h2 = ao2mo.full(mf._eri, mf.mo_coeff)
         fcidump.from_integrals(tmpfcidump.name, h1, h2, h1.shape[0],
                                mol.nelectron, tol=1e-15)
 
     def test_read(self):
-        with tempfile.NamedTemporaryFile(mode='w+') as f:
+        with lib.NamedTemporaryFile(mode='w+') as f:
             f.write('''&FCI NORB=4,
 NELEC=4, MS2=0, ISYM=1,
 ORBSYM=1,2,3,4,
@@ -72,7 +70,7 @@ def test_read(self):
             result = fcidump.read(f.name)
         self.assertEqual(result['ISYM'], 1)
 
-        with tempfile.NamedTemporaryFile(mode='w+') as f:
+        with lib.NamedTemporaryFile(mode='w+') as f:
             f.write('''&FCI NORB=4, NELEC=4, MS2=0, ISYM=1,ORBSYM=1,2,3,4, &END
 0.42 1 1 1 1
 0.33 1 1 2 2
@@ -87,7 +85,7 @@ def test_read(self):
 
     def test_to_scf(self):
         '''Test from_scf and to_scf'''
-        tmpfcidump = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+        tmpfcidump = lib.NamedTemporaryFile(dir=lib.param.TMPDIR)
         fcidump.from_scf(mf, tmpfcidump.name)
         mf1 = fcidump.to_scf(tmpfcidump.name)
         mf1.init_guess = mf.make_rdm1()
@@ -96,9 +94,9 @@ def test_to_scf(self):
         self.assertTrue(numpy.array_equal(mf.orbsym, mf1.orbsym))
 
     def test_to_scf_with_symmetry(self):
-        with tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR) as tmpfcidump:
+        with lib.NamedTemporaryFile(dir=lib.param.TMPDIR) as tmpfcidump:
             mol = gto.M(atom='H 0 0 0; H 1 0 0', symmetry=True)
-            mf = mol.RHF().run()
+            mf = mol.RHF(chkfile=lib.NamedTemporaryFile().name).run()
             fcidump.from_scf(mf, tmpfcidump.name)
             mf = fcidump.to_scf(tmpfcidump.name)
             self.assertEqual(mf.mol.groupname, 'D2h')
diff --git a/pyscf/tools/test/test_finite_diff.py b/pyscf/tools/test/test_finite_diff.py
index 3af64c8c88..10abd08804 100644
--- a/pyscf/tools/test/test_finite_diff.py
+++ b/pyscf/tools/test/test_finite_diff.py
@@ -62,7 +62,6 @@ def test_no_scanner(self):
 
     def test_convergence_failed(self):
         mol = pyscf.M(atom='H 0 0 0; H 0 0 1')
-        mol.verbose = 4
         geom_ref = mol.atom_coords()
         mf = mol.RHF().run()
         ref = mf.Gradients().kernel()
diff --git a/pyscf/tools/test/test_molden.py b/pyscf/tools/test/test_molden.py
index 944c7c9178..32cbdd24f6 100644
--- a/pyscf/tools/test/test_molden.py
+++ b/pyscf/tools/test/test_molden.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import unittest
-import tempfile
 from pyscf import lib, gto, scf
 from pyscf.tools import molden
 
@@ -37,7 +36,7 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_dump_scf(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         fname = ftmp.name
         molden.dump_scf(mf, fname)
         res = molden.read(fname)
@@ -45,7 +44,7 @@ def test_dump_scf(self):
         self.assertAlmostEqual(abs(mf.mo_coeff-mo_coeff).max(), 0, 12)
 
     def test_dump_uhf(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         fname = ftmp.name
         with lib.temporary_env(mol, spin=2, charge=2):
             mf = scf.UHF(mol).run()
@@ -57,7 +56,7 @@ def test_dump_uhf(self):
             self.assertAlmostEqual(abs(mf.mo_coeff[1]-mo_coeff[1]).max(), 0, 12)
 
     def test_dump_cartesian_gto_orbital(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         fname = ftmp.name
         with lib.temporary_env(mol, cart=True, symmetry=False):
             mf = scf.UHF(mol).run()
@@ -69,7 +68,7 @@ def test_dump_cartesian_gto_orbital(self):
             self.assertAlmostEqual(abs(mf.mo_coeff[1]-mo_coeff[1]).max(), 0, 12)
 
     def test_dump_cartesian_gto_symm_orbital(self):
-        ftmp = tempfile.NamedTemporaryFile()
+        ftmp = lib.NamedTemporaryFile()
         fname = ftmp.name
 
         pmol = mol.copy()
@@ -83,7 +82,7 @@ def test_dump_cartesian_gto_symm_orbital(self):
         self.assertAlmostEqual(abs(mf.mo_coeff-mo_coeff).max(), 0, 12)
 
     def test_basis_not_sorted(self):
-        with tempfile.NamedTemporaryFile('w') as ftmp:
+        with lib.NamedTemporaryFile('w') as ftmp:
             ftmp.write('''\
 [Molden Format]
 made by pyscf v[2.4.0]
diff --git a/pyscf/tools/wfn_format.py b/pyscf/tools/wfn_format.py
index 6470604012..2fba89516b 100644
--- a/pyscf/tools/wfn_format.py
+++ b/pyscf/tools/wfn_format.py
@@ -90,11 +90,13 @@
     [21,24,25,30,33,31,26,34,35,28,22,27,32,29,23],  # G
     [56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36],  # H
 ]
+
+
 def write_mo(fout, mol, mo_coeff, mo_energy=None, mo_occ=None):
     if mol.cart:
         raise NotImplementedError('Cartesian basis not available')
 
-    #FIXME: Duplicated primitives may lead to problems.  x2c._uncontract_mol
+    # FIXME: Duplicated primitives may lead to problems.  x2c._uncontract_mol
     # is the workaround at the moment to remove duplicated primitives.
     from pyscf.x2c import x2c
     mol, ctr = x2c._uncontract_mol(mol, True, 0.)
@@ -118,8 +120,7 @@ def write_mo(fout, mol, mo_coeff, mo_energy=None, mo_occ=None):
         mosub = numpy.einsum('yki,cy,pk->pci', mosub, c2s, c)
         mo_cart.append(mosub.transpose(1,0,2).reshape(-1,nmo))
 
-        for t in TYPE_MAP[l]:
-            types.append([t]*np)
+        types.extend([t] * np for t in TYPE_MAP[l])
         ncart = mol.bas_len_cart(ib)
         exps.extend([es]*ncart)
         centers.extend([ia+1]*(np*ncart))
@@ -139,11 +140,11 @@ def write_mo(fout, mol, mo_coeff, mo_energy=None, mo_occ=None):
                    % (mol.atom_pure_symbol(ia), ia+1, ia+1, x, y, z,
                       mol.atom_charge(ia)))
     for i0, i1 in lib.prange(0, nprim, 20):
-        fout.write('CENTRE ASSIGNMENTS  %s\n' % ''.join('%3d'%x for x in centers[i0:i1]))
+        fout.write('CENTRE ASSIGNMENTS  %s\n' % ''.join('%3d' % x for x in centers[i0:i1]))
     for i0, i1 in lib.prange(0, nprim, 20):
-        fout.write('TYPE ASSIGNMENTS    %s\n' % ''.join('%3d'%x for x in types[i0:i1]))
+        fout.write('TYPE ASSIGNMENTS    %s\n' % ''.join('%3d' % x for x in types[i0:i1]))
     for i0, i1 in lib.prange(0, nprim, 5):
-        fout.write('EXPONENTS  %s\n' % ' '.join('%13.7E'%x for x in exps[i0:i1]))
+        fout.write('EXPONENTS  %s\n' % ' '.join('%13.7E' % x for x in exps[i0:i1]))
 
     for k in range(nmo):
         mo = mo_cart[:,k]
@@ -156,15 +157,16 @@ def write_mo(fout, mol, mo_coeff, mo_energy=None, mo_occ=None):
             fout.write('MO  %-4d                  OCC NO = %12.8f ORB. ENERGY = %12.8f\n' %
                        (k+1, mo_occ[k], mo_energy[k]))
         for i0, i1 in lib.prange(0, nprim, 5):
-            fout.write(' %s\n' % ' '.join('%15.8E'%x for x in mo[i0:i1]))
+            fout.write(' %s\n' % ' '.join('%15.8E' % x for x in mo[i0:i1]))
     fout.write('END DATA\n')
     if mo_energy is None or mo_occ is None:
         fout.write('ALDET    ENERGY =        0.0000000000   VIRIAL(-V/T)  =   0.00000000\n')
     elif mo_energy is None and mo_occ is None:
         pass
-    else :
+    else:
         fout.write('RHF      ENERGY =        0.0000000000   VIRIAL(-V/T)  =   0.00000000\n')
 
+
 def write_ci(fout, fcivec, norb, nelec, ncore=0):
     from pyscf import fci
     if isinstance(nelec, (int, numpy.number)):
@@ -179,6 +181,7 @@ def write_ci(fout, fcivec, norb, nelec, ncore=0):
     nb = fci.cistring.num_strings(norb, nelecb)
     stringsa = fci.cistring.gen_strings4orblist(range(norb), neleca)
     stringsb = fci.cistring.gen_strings4orblist(range(norb), nelecb)
+
     def str2orbidx(string, ncore):
         bstring = bin(string)
         return [i+1+ncore for i,s in enumerate(bstring[::-1]) if s == '1']
@@ -188,9 +191,10 @@ def str2orbidx(string, ncore):
         addra, addrb = divmod(iaddr, nb)
         idxa = ['%3d' % x for x in str2orbidx(stringsa[addra], ncore)]
         idxb = ['%3d' % (-x) for x in str2orbidx(stringsb[addrb], ncore)]
-        #TODO:add a cuttoff and a counter for ndets
+        # TODO:add a cuttoff and a counter for ndets
         fout.write('%18.10E %s %s\n' % (fcivec[addra,addrb], ' '.join(idxa), ' '.join(idxb)))
 
+
 if __name__ == '__main__':
     from pyscf import scf, mcscf, symm
     from pyscf.tools import molden
@@ -198,9 +202,9 @@ def str2orbidx(string, ncore):
                 unit='B', basis='ccpvtz', verbose=4,
                 symmetry=1, symmetry_subgroup='d2h')
     mf = scf.RHF(mol).run()
-    coeff = mf.mo_coeff[:,mf.mo_occ>0]
-    energy = mf.mo_energy[mf.mo_occ>0]
-    occ = mf.mo_occ[mf.mo_occ>0]
+    coeff = mf.mo_coeff[:,mf.mo_occ > 0]
+    energy = mf.mo_energy[mf.mo_occ > 0]
+    occ = mf.mo_occ[mf.mo_occ > 0]
     with open('n2_hf.wfn', 'w') as f2:
         write_mo(f2, mol, coeff, energy, occ)
 #
diff --git a/pyscf/x2c/sfx2c1e.py b/pyscf/x2c/sfx2c1e.py
index 62004154e6..03734abefb 100644
--- a/pyscf/x2c/sfx2c1e.py
+++ b/pyscf/x2c/sfx2c1e.py
@@ -184,19 +184,7 @@ def get_hcore(self, mol=None):
             h1 = x2c._get_hcore_fw(t, v, w, s, x, c)
 
         elif 'ATOM' in self.approx.upper():
-            atom_slices = xmol.offset_nr_by_atom()
-            nao = xmol.nao_nr()
-            x = numpy.zeros((nao,nao))
-            for ia in range(xmol.natm):
-                ish0, ish1, p0, p1 = atom_slices[ia]
-                shls_slice = (ish0, ish1, ish0, ish1)
-                t1 = xmol.intor('int1e_kin', shls_slice=shls_slice)
-                s1 = xmol.intor('int1e_ovlp', shls_slice=shls_slice)
-                with xmol.with_rinv_at_nucleus(ia):
-                    z = -xmol.atom_charge(ia)
-                    v1 = z * xmol.intor('int1e_rinv', shls_slice=shls_slice)
-                    w1 = z * xmol.intor('int1e_prinvp', shls_slice=shls_slice)
-                x[p0:p1,p0:p1] = x2c._x2c1e_xmatrix(t1, v1, w1, s1, c)
+            x = _atomic_1e_x(xmol)
             h1 = x2c._get_hcore_fw(t, v, w, s, x, c)
 
         else:
@@ -253,19 +241,7 @@ def get_xmat(self, mol=None):
         assert ('1E' in self.approx.upper())
 
         if 'ATOM' in self.approx.upper():
-            atom_slices = xmol.offset_nr_by_atom()
-            nao = xmol.nao_nr()
-            x = numpy.zeros((nao,nao))
-            for ia in range(xmol.natm):
-                ish0, ish1, p0, p1 = atom_slices[ia]
-                shls_slice = (ish0, ish1, ish0, ish1)
-                t1 = xmol.intor('int1e_kin', shls_slice=shls_slice)
-                s1 = xmol.intor('int1e_ovlp', shls_slice=shls_slice)
-                with xmol.with_rinv_at_nucleus(ia):
-                    z = -xmol.atom_charge(ia)
-                    v1 = z * xmol.intor('int1e_rinv', shls_slice=shls_slice)
-                    w1 = z * xmol.intor('int1e_prinvp', shls_slice=shls_slice)
-                x[p0:p1,p0:p1] = x2c._x2c1e_xmatrix(t1, v1, w1, s1, c)
+            x = _atomic_1e_x(xmol)
         else:
             t = xmol.intor_symmetric('int1e_kin')
             v = xmol.intor_symmetric('int1e_nuc')
@@ -297,28 +273,22 @@ def hcore_deriv_generator(self, mol=None, deriv=1):
 
 SpinFreeX2C = SpinFreeX2CHelper
 
-
-if __name__ == '__main__':
-    mol = gto.Mole()
-    mol.build(
-        verbose = 0,
-        atom = [["O" , (0. , 0.     , 0.)],
-                [1   , (0. , -0.757 , 0.587)],
-                [1   , (0. , 0.757  , 0.587)] ],
-        basis = 'ccpvdz-dk',
-    )
-
-    method = hf.RHF(mol)
-    enr = method.kernel()
-    print('E(NR) = %.12g' % enr)
-
-    method = sfx2c1e(hf.RHF(mol))
-    esfx2c = method.kernel()
-    print('E(SFX2C1E) = %.12g' % esfx2c)
-    method.with_x2c.basis = 'unc-ccpvqz-dk'
-    print('E(SFX2C1E) = %.12g' % method.kernel())
-    method.with_x2c.approx = 'atom1e'
-    print('E(SFX2C1E) = %.12g' % method.kernel())
-
-    mf = method.density_fit().undo_x2c().run()
-    print('E(DF-NR) = %.12g' % mf.e_tot)
+def _atomic_1e_x(mol):
+    atoms = x2c._atoms_in_mole(mol)
+    x_conf = {}
+    c = lib.param.LIGHT_SPEED
+    for elem, atom in atoms.items():
+        t1 = atom.intor_symmetric('int1e_kin')
+        s1 = atom.intor_symmetric('int1e_ovlp')
+        v1 = atom.intor_symmetric('int1e_nuc')
+        w1 = atom.intor_symmetric('int1e_pnucp')
+        x_conf[elem] = x2c._x2c1e_xmatrix(t1, v1, w1, s1, c)
+
+    atom_slices = mol.offset_nr_by_atom()
+    nao = mol.nao
+    x = numpy.zeros((nao, nao))
+    for ia in range(mol.natm):
+        p0, p1 = atom_slices[ia, 2:]
+        elem = mol.atom_symbol(ia)
+        x[p0:p1,p0:p1] = x_conf[elem]
+    return x
diff --git a/pyscf/x2c/sfx2c1e_grad.py b/pyscf/x2c/sfx2c1e_grad.py
index fa8fa9a881..18a4e4c223 100644
--- a/pyscf/x2c/sfx2c1e_grad.py
+++ b/pyscf/x2c/sfx2c1e_grad.py
@@ -25,7 +25,7 @@
 import scipy.linalg
 from pyscf import lib
 from pyscf import gto
-from pyscf.x2c import x2c
+from pyscf.x2c import x2c, sfx2c1e
 
 def hcore_grad_generator(x2cobj, mol=None):
     '''nuclear gradients of 1-component X2c hcore Hamiltonian  (spin-free part only)
@@ -49,25 +49,13 @@ def hcore_deriv(atm_id):
 
 def gen_sf_hfw(mol, approx='1E'):
     approx = approx.upper()
-    c = lib.param.LIGHT_SPEED
 
     h0, s0 = _get_h0_s0(mol)
     e0, c0 = scipy.linalg.eigh(h0, s0)
 
-    aoslices = mol.aoslice_by_atom()
     nao = mol.nao_nr()
     if 'ATOM' in approx:
-        x0 = numpy.zeros((nao,nao))
-        for ia in range(mol.natm):
-            ish0, ish1, p0, p1 = aoslices[ia]
-            shls_slice = (ish0, ish1, ish0, ish1)
-            t1 = mol.intor('int1e_kin', shls_slice=shls_slice)
-            s1 = mol.intor('int1e_ovlp', shls_slice=shls_slice)
-            with mol.with_rinv_at_nucleus(ia):
-                z = -mol.atom_charge(ia)
-                v1 = z * mol.intor('int1e_rinv', shls_slice=shls_slice)
-                w1 = z * mol.intor('int1e_prinvp', shls_slice=shls_slice)
-            x0[p0:p1,p0:p1] = x2c._x2c1e_xmatrix(t1, v1, w1, s1, c)
+        x0 = sfx2c1e._atomic_1e_x(mol)
     else:
         cl0 = c0[:nao,nao:]
         cs0 = c0[nao:,nao:]
@@ -244,51 +232,3 @@ def _get_r1(s0_roots, s_nesc0, s1, s_nesc1, r0_roots):
     R1 += reduce(numpy.dot, (vr0_s0_invsqrt.T, vr0_wr0_sqrt.T, s1_sqrt))
     R1 = reduce(numpy.dot, (v_s, R1, v_s.T))
     return R1
-
-
-if __name__ == '__main__':
-    bak = lib.param.LIGHT_SPEED
-    lib.param.LIGHT_SPEED = 10
-    def get_h(mol):
-        c = lib.param.LIGHT_SPEED
-        t = mol.intor_symmetric('int1e_kin')
-        v = mol.intor_symmetric('int1e_nuc')
-        s = mol.intor_symmetric('int1e_ovlp')
-        w = mol.intor_symmetric('int1e_pnucp')
-        return x2c._x2c1e_get_hcore(t, v, w, s, c)
-
-    mol = gto.M(
-        verbose = 0,
-        atom = [["O" , (0. , 0.     , 0.0001)],
-                [1   , (0. , -0.757 , 0.587)],
-                [1   , (0. , 0.757  , 0.587)]],
-        basis = '3-21g',
-    )
-    h_1 = get_h(mol)
-
-    mol = gto.M(
-        verbose = 0,
-        atom = [["O" , (0. , 0.     ,-0.0001)],
-                [1   , (0. , -0.757 , 0.587)],
-                [1   , (0. , 0.757  , 0.587)]],
-        basis = '3-21g',
-    )
-    h_2 = get_h(mol)
-    h_ref = (h_1 - h_2) / 0.0002 * lib.param.BOHR
-
-    mol = gto.M(
-        verbose = 0,
-        atom = [["O" , (0. , 0.     , 0.   )],
-                [1   , (0. , -0.757 , 0.587)],
-                [1   , (0. , 0.757  , 0.587)]],
-        basis = '3-21g',
-    )
-    hcore_deriv = gen_sf_hfw(mol)
-    h1 = hcore_deriv(0)
-    print(abs(h1[2]-h_ref).max())
-    lib.param.LIGHT_SPEED = bak
-
-    print(lib.finger(h1) - -1.4618392662849411)
-    hcore_deriv = gen_sf_hfw(mol, approx='atom1e')
-    h1 = hcore_deriv(0)
-    print(lib.finger(h1) - -1.3596826558976405)
diff --git a/pyscf/x2c/sfx2c1e_hess.py b/pyscf/x2c/sfx2c1e_hess.py
index 2385d6422d..2e629d4244 100644
--- a/pyscf/x2c/sfx2c1e_hess.py
+++ b/pyscf/x2c/sfx2c1e_hess.py
@@ -26,7 +26,7 @@
 import scipy.linalg
 from pyscf import lib
 from pyscf import gto
-from pyscf.x2c import x2c
+from pyscf.x2c import x2c, sfx2c1e
 from pyscf.x2c import sfx2c1e_grad
 
 def hcore_hess_generator(x2cobj, mol=None):
@@ -60,17 +60,7 @@ def gen_sf_hfw(mol, approx='1E'):
     aoslices = mol.aoslice_by_atom()
     nao = mol.nao_nr()
     if 'ATOM' in approx:
-        x0 = numpy.zeros((nao,nao))
-        for ia in range(mol.natm):
-            ish0, ish1, p0, p1 = aoslices[ia]
-            shls_slice = (ish0, ish1, ish0, ish1)
-            t1 = mol.intor('int1e_kin', shls_slice=shls_slice)
-            s1 = mol.intor('int1e_ovlp', shls_slice=shls_slice)
-            with mol.with_rinv_at_nucleus(ia):
-                z = -mol.atom_charge(ia)
-                v1 = z * mol.intor('int1e_rinv', shls_slice=shls_slice)
-                w1 = z * mol.intor('int1e_prinvp', shls_slice=shls_slice)
-            x0[p0:p1,p0:p1] = x2c._x2c1e_xmatrix(t1, v1, w1, s1, c)
+        x0 = sfx2c1e._atomic_1e_x(mol)
     else:
         cl0 = c0[:nao,nao:]
         cs0 = c0[nao:,nao:]
@@ -329,46 +319,3 @@ def _get_r2(s0_roots, sa0, s1i, sa1i, s1j, sa1j, s2, sa2, r0_roots):
     R2 += lib.einsum('i,iq,qj->ij' , w_invsqrt  , R0_mid , s2_sqrt)
     R2 = reduce(numpy.dot, (v_s, R2, v_s.T))
     return R2
-
-
-if __name__ == '__main__':
-    bak = lib.param.LIGHT_SPEED
-    lib.param.LIGHT_SPEED = 10
-
-    mol = gto.M(
-        verbose = 0,
-        atom = [["O" , (0. , 0.     , 0.0001)],
-                [1   , (0. , -0.757 , 0.587)],
-                [1   , (0. , 0.757  , 0.587)]],
-        basis = '3-21g',
-    )
-    h1_deriv_1 = sfx2c1e_grad.gen_sf_hfw(mol, approx='1E')
-
-    mol = gto.M(
-        verbose = 0,
-        atom = [["O" , (0. , 0.     ,-0.0001)],
-                [1   , (0. , -0.757 , 0.587)],
-                [1   , (0. , 0.757  , 0.587)]],
-        basis = '3-21g',
-    )
-    h1_deriv_2 = sfx2c1e_grad.gen_sf_hfw(mol, approx='1E')
-
-    mol = gto.M(
-        verbose = 0,
-        atom = [["O" , (0. , 0.     , 0.   )],
-                [1   , (0. , -0.757 , 0.587)],
-                [1   , (0. , 0.757  , 0.587)]],
-        basis = '3-21g',
-    )
-    h2_deriv = gen_sf_hfw(mol)
-
-    h2 = h2_deriv(0,0)
-    h2_ref = (h1_deriv_1(0)[2] - h1_deriv_2(0)[2]) / 0.0002 * lib.param.BOHR
-    print(abs(h2[2,2]-h2_ref).max())
-    print(lib.finger(h2) - 33.71188112440316)
-
-    h2 = h2_deriv(1,0)
-    h2_ref = (h1_deriv_1(1)[2] - h1_deriv_2(1)[2]) / 0.0002 * lib.param.BOHR
-    print(abs(h2[2,2]-h2_ref).max())
-    print(lib.finger(h2) - -23.609411428378138)
-    lib.param.LIGHT_SPEED = bak
diff --git a/pyscf/x2c/test/test_tdscf.py b/pyscf/x2c/test/test_tdscf.py
index a23607b78d..3a1aa04ca3 100644
--- a/pyscf/x2c/test/test_tdscf.py
+++ b/pyscf/x2c/test/test_tdscf.py
@@ -17,7 +17,6 @@
 #
 
 import unittest
-import tempfile
 import numpy
 from pyscf import lib, gto, scf
 from pyscf.dft import radi
@@ -40,8 +39,7 @@ def setUpModule():
     mol.spin = 1
     mol.build()
 
-    mf_lda = dft.UKS(mol).set(xc='lda,', conv_tol=1e-12,
-                              chkfile=tempfile.NamedTemporaryFile().name).newton().run()
+    mf_lda = dft.UKS(mol).set(xc='lda,', conv_tol=1e-12, chkfile=lib.NamedTemporaryFile().name).newton().run()
 
 def tearDownModule():
     global mol, mf_lda
diff --git a/pyscf/x2c/test/test_x2c.py b/pyscf/x2c/test/test_x2c.py
index eb061626a4..443b002714 100644
--- a/pyscf/x2c/test/test_x2c.py
+++ b/pyscf/x2c/test/test_x2c.py
@@ -17,6 +17,7 @@
 #
 
 import numpy
+import scipy.linalg
 import unittest
 from pyscf import gto
 from pyscf import scf
@@ -185,23 +186,42 @@ def test_lindep_xbasis(self):
 C     F 
     0.761000000E+00    0.100000000E+01
 ''')
-        xmol, c = x2c.X2C(mol).get_xmol(mol)
+        x2c_obj = x2c.X2C(mol)
+        xmol, c = x2c_obj.get_xmol(mol)
         self.assertEqual(xmol.nbas, 18)
         self.assertEqual(xmol.nao, 42)
         self.assertAlmostEqual(lib.fp(c), -5.480689638416739, 12)
 
+        hcore = x2c_obj.get_hcore()
+        s = mol.intor_symmetric('int1e_ovlp_spinor')
+        e_ref = scipy.linalg.eigvalsh(hcore, s)
+
+        mol = gto.M(atom='C', basis=(mol.basis, [[0, [0.128500001, 1]]]))
+        x2c_obj = x2c.X2C(mol)
+        xmol, c = x2c_obj.get_xmol(mol)
+        self.assertEqual(xmol.nbas, 19)
+        self.assertEqual(xmol.nao, 43)
+        hcore = x2c_obj.get_hcore()
+        s = mol.intor_symmetric('int1e_ovlp_spinor')
+        d, t = scipy.linalg.eigh(s)
+        idx = d > 1e-8
+        t = t[:,idx] / numpy.sqrt(d[idx])
+        tht = t.T.conj().dot(hcore.dot(t))
+        e = scipy.linalg.eigvalsh(tht)
+        self.assertAlmostEqual(abs(e - e_ref).max(), 0, 6)
+
     def test_get_hcore(self):
         myx2c = scf.RHF(mol).sfx2c1e()
         myx2c.with_x2c.get_xmat = lambda xmol: numpy.zeros((xmol.nao, xmol.nao))
         h1 = myx2c.with_x2c.get_hcore()
         ref = mol.intor('int1e_nuc')
-        self.assertAlmostEqual(abs(h1 - ref).max(), 0, 12)
+        self.assertAlmostEqual(abs(h1 - ref).max(), 0, 11)
 
         with_x2c = x2c.X2C(mol)
         with_x2c.get_xmat = lambda xmol: numpy.zeros((xmol.nao_2c(), xmol.nao_2c()))
         h1 = with_x2c.get_hcore()
         ref = mol.intor('int1e_nuc_spinor')
-        self.assertAlmostEqual(abs(h1 - ref).max(), 0, 12)
+        self.assertAlmostEqual(abs(h1 - ref).max(), 0, 11)
 
     def test_ghf(self):
         # Test whether the result of spinor X2C is a solution of .GHF().x2c()
@@ -225,9 +245,20 @@ def test_ghf_atom(self):
         mf_atom1e = mol.GHF().x2c1e()
         mf_atom1e.with_x2c.approx = 'ATOM1E'
         mf_atom1e.kernel()
-        self.assertAlmostEqual(abs(mf_1e.e_tot - mf_atom1e.e_tot).max(), 0, 9)
+        self.assertAlmostEqual(mf_1e.e_tot, mf_atom1e.e_tot, 9)
         self.assertAlmostEqual(abs(mf_1e.mo_energy - mf_atom1e.mo_energy).max(), 0, 9)
 
+        with lib.temporary_env(lib.param, LIGHT_SPEED=15.):
+            mol = gto.M(atom='Ne 0 1 -1; Ne 0 8 8', basis='ccpvdz')
+            mf_1e = mol.GHF().x2c1e()
+            mf_1e.kernel()
+            mf_atom1e = mol.GHF().x2c1e()
+            mf_atom1e.with_x2c.approx = 'ATOM1E'
+            mf_atom1e.kernel()
+            self.assertAlmostEqual(mf_1e.e_tot, -267.39699993561, 8)
+            self.assertAlmostEqual(mf_1e.e_tot, mf_atom1e.e_tot, 8)
+            self.assertAlmostEqual(abs(mf_1e.mo_energy - mf_atom1e.mo_energy).max(), 0, 6)
+
     def test_gks(self):
         mol = gto.M(atom='C', basis='ccpvdz-dk')
         ref = mol.DKS(xc='b3lyp').x2c().run()
@@ -241,6 +272,17 @@ def test_gks(self):
         self.assertAlmostEqual(mf.e_tot, ref.e_tot, 9)
         self.assertAlmostEqual(abs(mf.dip_moment() - ref.dip_moment()).max(), 0, 9)
 
+        with lib.temporary_env(lib.param, LIGHT_SPEED=15.):
+            mol = gto.M(atom='Ne 0 1 -1; Ne 0 8 8', basis='ccpvdz')
+            mf_1e = mol.DKS().x2c1e()
+            mf_1e.kernel()
+            mf_atom1e = mol.DKS().x2c1e()
+            mf_atom1e.with_x2c.approx = 'ATOM1E'
+            mf_atom1e.kernel()
+            self.assertAlmostEqual(mf_1e.e_tot, -266.688128052731, 8)
+            self.assertAlmostEqual(mf_1e.e_tot, mf_atom1e.e_tot, 8)
+            self.assertAlmostEqual(abs(mf_1e.mo_energy - mf_atom1e.mo_energy).max(), 0, 6)
+
     def test_undo_x2c(self):
         mf = mol.RHF().x2c().density_fit()
         self.assertEqual(mf.__class__.__name__, 'DFsfX2C1eRHF')
diff --git a/pyscf/x2c/test/test_x2c_grad.py b/pyscf/x2c/test/test_x2c_grad.py
index c823154ad8..2fe158d573 100644
--- a/pyscf/x2c/test/test_x2c_grad.py
+++ b/pyscf/x2c/test/test_x2c_grad.py
@@ -19,7 +19,7 @@
 import scipy.linalg
 from pyscf import lib
 from pyscf import gto
-from pyscf.x2c import sfx2c1e
+from pyscf.x2c import x2c, sfx2c1e
 from pyscf.x2c import sfx2c1e_grad
 
 def _sqrt0(a):
@@ -268,6 +268,47 @@ def test_hfw(self):
             fh = x2cobj.hcore_deriv_generator(deriv=1)
             self.assertAlmostEqual(abs(fh(0)[2] - fh_ref).max(), 0, 7)
 
+    def test_hcore(self):
+        with lib.light_speed(10) as c:
+            mol = gto.M(
+                verbose = 0,
+                atom = [["O" , (0. , 0.     , 0.0001)],
+                        [1   , (0. , -0.757 , 0.587)],
+                        [1   , (0. , 0.757  , 0.587)]],
+                basis = '3-21g',
+            )
+            h_1 = sfx2c1e.SpinFreeX2CHelper(mol).set(xuncontract=False).get_hcore()
+            ha_1 = sfx2c1e.SpinFreeX2CHelper(mol).set(xuncontract=False, approx='ATOM1E').get_hcore()
+
+            mol = gto.M(
+                verbose = 0,
+                atom = [["O" , (0. , 0.     ,-0.0001)],
+                        [1   , (0. , -0.757 , 0.587)],
+                        [1   , (0. , 0.757  , 0.587)]],
+                basis = '3-21g',
+            )
+            h_2 = sfx2c1e.SpinFreeX2CHelper(mol).set(xuncontract=False).get_hcore()
+            ha_2 = sfx2c1e.SpinFreeX2CHelper(mol).set(xuncontract=False, approx='ATOM1E').get_hcore()
+            h_ref = (h_1 - h_2) / 0.0002 * lib.param.BOHR
+            ha_ref = (ha_1 - ha_2) / 0.0002 * lib.param.BOHR
+
+            mol = gto.M(
+                verbose = 0,
+                atom = [["O" , (0. , 0.     , 0.   )],
+                        [1   , (0. , -0.757 , 0.587)],
+                        [1   , (0. , 0.757  , 0.587)]],
+                basis = '3-21g',
+            )
+            hcore_deriv = sfx2c1e_grad.gen_sf_hfw(mol)
+            h1 = hcore_deriv(0)
+            self.assertAlmostEqual(abs(h1[2]-h_ref).max(), 0, 6)
+
+            self.assertAlmostEqual(lib.fp(h1), -1.4618392662849411, 9)
+            hcore_deriv = sfx2c1e_grad.gen_sf_hfw(mol, approx='atom1e')
+            h1 = hcore_deriv(0)
+            self.assertAlmostEqual(abs(h1[2]-ha_ref).max(), 0, 6)
+            self.assertAlmostEqual(lib.fp(h1), -1.4802587171126063, 9)
+
 if __name__ == "__main__":
     print("Full Tests for sfx2c1e gradients")
     unittest.main()
diff --git a/pyscf/x2c/test/test_x2c_hess.py b/pyscf/x2c/test/test_x2c_hess.py
index e1cc187f9e..06285640ce 100644
--- a/pyscf/x2c/test/test_x2c_hess.py
+++ b/pyscf/x2c/test/test_x2c_hess.py
@@ -637,6 +637,53 @@ def test_hfw2(self):
         h2_ref = (h1_deriv_1(1)[2] - h1_deriv_2(1)[2]) / 0.0002 * lib.param.BOHR
         self.assertAlmostEqual(abs(h2[2,2]-h2_ref).max(), 0, 7)
 
+    def test_hcore(self):
+        with lib.light_speed(10) as c:
+            mol = gto.M(
+                verbose = 0,
+                atom = [["O" , (0. , 0.     , 0.0001)],
+                        [1   , (0. , -0.757 , 0.587)],
+                        [1   , (0. , 0.757  , 0.587)]],
+                basis = '3-21g',
+            )
+            h1_deriv_1 = sfx2c1e_grad.gen_sf_hfw(mol, approx='1E')
+            ha1_deriv_1 = sfx2c1e_grad.gen_sf_hfw(mol, approx='ATOM1E')
+
+            mol = gto.M(
+                verbose = 0,
+                atom = [["O" , (0. , 0.     ,-0.0001)],
+                        [1   , (0. , -0.757 , 0.587)],
+                        [1   , (0. , 0.757  , 0.587)]],
+                basis = '3-21g',
+            )
+            h1_deriv_2 = sfx2c1e_grad.gen_sf_hfw(mol, approx='1E')
+            ha1_deriv_2 = sfx2c1e_grad.gen_sf_hfw(mol, approx='ATOM1E')
+
+            mol = gto.M(
+                verbose = 0,
+                atom = [["O" , (0. , 0.     , 0.   )],
+                        [1   , (0. , -0.757 , 0.587)],
+                        [1   , (0. , 0.757  , 0.587)]],
+                basis = '3-21g',
+            )
+            h2_deriv = sfx2c1e_hess.gen_sf_hfw(mol)
+            ha2_deriv = sfx2c1e_hess.gen_sf_hfw(mol, approx='ATOM1E')
+
+            h2 = h2_deriv(0,0)
+            h2_ref = (h1_deriv_1(0)[2] - h1_deriv_2(0)[2]) / 0.0002 * lib.param.BOHR
+            self.assertAlmostEqual(abs(h2[2,2]-h2_ref).max(), 0, 6)
+            self.assertAlmostEqual(lib.fp(h2), 33.71188112440316, 9)
+
+            h2 = h2_deriv(1,0)
+            h2_ref = (h1_deriv_1(1)[2] - h1_deriv_2(1)[2]) / 0.0002 * lib.param.BOHR
+            self.assertAlmostEqual(abs(h2[2,2]-h2_ref).max(), 0, 6)
+            self.assertAlmostEqual(lib.fp(h2), -23.609411428378138, 7)
+
+            h2 = ha2_deriv(0,0)
+            h2_ref = (ha1_deriv_1(0)[2] - ha1_deriv_2(0)[2]) / 0.0002 * lib.param.BOHR
+            self.assertAlmostEqual(abs(h2[2,2]-h2_ref).max(), 0, 6)
+            self.assertAlmostEqual(lib.fp(h2), 33.718665748856324, 9)
+
 
 if __name__ == "__main__":
     print("Full Tests for sfx2c1e gradients")
diff --git a/pyscf/x2c/x2c.py b/pyscf/x2c/x2c.py
index d2bb85526c..408ac0736c 100644
--- a/pyscf/x2c/x2c.py
+++ b/pyscf/x2c/x2c.py
@@ -91,19 +91,7 @@ def get_hcore(self, mol=None):
             h1 = _get_hcore_fw(t, v, w, s, x, c)
 
         elif 'ATOM' in self.approx.upper():
-            atom_slices = xmol.offset_2c_by_atom()
-            n2c = xmol.nao_2c()
-            x = numpy.zeros((n2c,n2c), dtype=numpy.complex128)
-            for ia in range(xmol.natm):
-                ish0, ish1, p0, p1 = atom_slices[ia]
-                shls_slice = (ish0, ish1, ish0, ish1)
-                s1 = xmol.intor('int1e_ovlp_spinor', shls_slice=shls_slice)
-                t1 = xmol.intor('int1e_spsp_spinor', shls_slice=shls_slice) * .5
-                with xmol.with_rinv_at_nucleus(ia):
-                    z = -xmol.atom_charge(ia)
-                    v1 = z*xmol.intor('int1e_rinv_spinor', shls_slice=shls_slice)
-                    w1 = z*xmol.intor('int1e_sprinvsp_spinor', shls_slice=shls_slice)
-                x[p0:p1,p0:p1] = _x2c1e_xmatrix(t1, v1, w1, s1, c)
+            x = _spinor_atomic_1e_x(xmol)
             h1 = _get_hcore_fw(t, v, w, s, x, c)
 
         else:
@@ -239,19 +227,7 @@ def get_xmat(self, mol=None):
         assert ('1E' in self.approx.upper())
 
         if 'ATOM' in self.approx.upper():
-            atom_slices = xmol.offset_2c_by_atom()
-            n2c = xmol.nao_2c()
-            x = numpy.zeros((n2c,n2c), dtype=numpy.complex128)
-            for ia in range(xmol.natm):
-                ish0, ish1, p0, p1 = atom_slices[ia]
-                shls_slice = (ish0, ish1, ish0, ish1)
-                s1 = xmol.intor('int1e_ovlp_spinor', shls_slice=shls_slice)
-                t1 = xmol.intor('int1e_spsp_spinor', shls_slice=shls_slice) * .5
-                with xmol.with_rinv_at_nucleus(ia):
-                    z = -xmol.atom_charge(ia)
-                    v1 = z*xmol.intor('int1e_rinv_spinor', shls_slice=shls_slice)
-                    w1 = z*xmol.intor('int1e_sprinvsp_spinor', shls_slice=shls_slice)
-                x[p0:p1,p0:p1] = _x2c1e_xmatrix(t1, v1, w1, s1, c)
+            x = _spinor_atomic_1e_x(xmol)
         else:
             s = xmol.intor_symmetric('int1e_ovlp_spinor')
             t = xmol.intor_symmetric('int1e_spsp_spinor') * .5
@@ -277,6 +253,8 @@ def reset(self, mol=None):
             self.mol = mol
         return self
 
+    to_gpu = lib.to_gpu
+
 class SpinorX2CHelper(X2CHelperBase):
     '''2-component X2c (including spin-free and spin-dependent terms) in
     the j-adapted spinor basis.
@@ -309,21 +287,7 @@ def get_hcore(self, mol=None):
             h1 = _get_hcore_fw(t, v, w, s, x, c)
 
         elif 'ATOM' in self.approx.upper():
-            atom_slices = xmol.offset_nr_by_atom()
-            # spin-orbital basis is twice the size of NR basis
-            atom_slices[:,2:] *= 2
-            nao = xmol.nao_nr() * 2
-            x = numpy.zeros((nao,nao), dtype=numpy.complex128)
-            for ia in range(xmol.natm):
-                ish0, ish1, p0, p1 = atom_slices[ia]
-                shls_slice = (ish0, ish1, ish0, ish1)
-                t1 = _block_diag(xmol.intor('int1e_kin', shls_slice=shls_slice))
-                s1 = _block_diag(xmol.intor('int1e_ovlp', shls_slice=shls_slice))
-                with xmol.with_rinv_at_nucleus(ia):
-                    z = -xmol.atom_charge(ia)
-                    v1 = _block_diag(z * xmol.intor('int1e_rinv', shls_slice=shls_slice))
-                    w1 = _sigma_dot(z * xmol.intor('int1e_sprinvsp', shls_slice=shls_slice))
-                x[p0:p1,p0:p1] = _x2c1e_xmatrix(t1, v1, w1, s1, c)
+            x = _spin_orbital_atomic_1e_x(xmol)
             h1 = _get_hcore_fw(t, v, w, s, x, c)
 
         else:
@@ -371,21 +335,7 @@ def get_xmat(self, mol=None):
         assert ('1E' in self.approx.upper())
 
         if 'ATOM' in self.approx.upper():
-            atom_slices = xmol.offset_nr_by_atom()
-            # spin-orbital basis is twice the size of NR basis
-            atom_slices[:,2:] *= 2
-            nao = xmol.nao_nr() * 2
-            x = numpy.zeros((nao,nao), dtype=numpy.complex128)
-            for ia in range(xmol.natm):
-                ish0, ish1, p0, p1 = atom_slices[ia]
-                shls_slice = (ish0, ish1, ish0, ish1)
-                t1 = _block_diag(xmol.intor('int1e_kin', shls_slice=shls_slice))
-                s1 = _block_diag(xmol.intor('int1e_ovlp', shls_slice=shls_slice))
-                with xmol.with_rinv_at_nucleus(ia):
-                    z = -xmol.atom_charge(ia)
-                    v1 = _block_diag(z * xmol.intor('int1e_rinv', shls_slice=shls_slice))
-                    w1 = _sigma_dot(z * xmol.intor('int1e_sprinvsp', shls_slice=shls_slice))
-                x[p0:p1,p0:p1] = _x2c1e_xmatrix(t1, v1, w1, s1, c)
+            x = _spin_orbital_atomic_1e_x(xmol)
         else:
             t = _block_diag(xmol.intor_symmetric('int1e_kin'))
             v = _block_diag(xmol.intor_symmetric('int1e_nuc'))
@@ -673,8 +623,31 @@ def __init__(self, mol):
         if dhf.zquatev is None:
             raise RuntimeError('zquatev library is required to perform Kramers-restricted X2C-RHF')
 
-    def _eigh(self, h, s):
-        return dhf.zquatev.solve_KR_FCSCE(self.mol, h, s)
+    def check_linear_dependency(self, s, verbose=None):
+        log = logger.new_logger(self, verbose)
+        idx = dhf._kramers_pair_sort_ao_idx(self.mol, four_component=False)
+        s = s[idx[:,None], idx]
+        e, v = dhf.zquatev.eigh(s)
+        if log is not None:
+            abs_e = abs(e)
+            emax = abs_e.max()
+            emin = abs_e.min()
+            c = emax / emin
+            log.debug('cond(S) = %s', c)
+            if c > 1e10:
+                log.warn('Singularity detected in the overlap matrix. '
+                         'SCF may be inaccurate and difficult to converge.')
+
+        if hf.remove_overlap_zero_eigenvalue:
+            mask = e > hf.overlap_zero_eigenvalue_threshold
+            x = v[:,mask] / numpy.sqrt(e[mask])
+        else:
+            x = v / numpy.sqrt(e)
+        x1 = numpy.empty_like(x)
+        x1[idx] = x
+        return x1
+
+    _eigh = dhf.RDHF._eigh
 
     def to_ks(self, xc='HF'):
         '''Convert the input mean-field object to an X2C-KS object.
@@ -824,12 +797,12 @@ def _uncontract_mol(mol, xuncontract=None, exp_drop=0.2):
 
 
 def _sqrt(a, tol=1e-14):
-    e, v = numpy.linalg.eigh(a)
+    e, v = scipy.linalg.eigh(a)
     idx = e > tol
     return numpy.dot(v[:,idx]*numpy.sqrt(e[idx]), v[:,idx].T.conj())
 
 def _invsqrt(a, tol=1e-14):
-    e, v = numpy.linalg.eigh(a)
+    e, v = scipy.linalg.eigh(a)
     idx = e > tol
     return numpy.dot(v[:,idx]/numpy.sqrt(e[idx]), v[:,idx].T.conj())
 
@@ -850,7 +823,7 @@ def _get_hcore_fw(t, v, w, s, x, c):
 def _get_r(s, snesc):
     # R^dag \tilde{S} R = S
     # R = S^{-1/2} [S^{-1/2}\tilde{S}S^{-1/2}]^{-1/2} S^{1/2}
-    w, v = numpy.linalg.eigh(s)
+    w, v = scipy.linalg.eigh(s)
     idx = w > 1e-14
     v = v[:,idx]
     w_sqrt = numpy.sqrt(w[idx])
@@ -859,7 +832,7 @@ def _get_r(s, snesc):
     # eigenvectors of S as the new basis
     snesc = reduce(numpy.dot, (v.conj().T, snesc, v))
     r_mid = numpy.einsum('i,ij,j->ij', w_invsqrt, snesc, w_invsqrt)
-    w1, v1 = numpy.linalg.eigh(r_mid)
+    w1, v1 = scipy.linalg.eigh(r_mid)
     idx1 = w1 > 1e-14
     v1 = v1[:,idx1]
     r_mid = numpy.dot(v1/numpy.sqrt(w1[idx1]), v1.conj().T)
@@ -885,19 +858,20 @@ def _x2c1e_xmatrix(t, v, w, s, c):
         e, a = scipy.linalg.eigh(h, m)
         cl = a[:nao,nao:]
         cs = a[nao:,nao:]
-        x = numpy.linalg.solve(cl.T, cs.T).T  # B = XA
+        x = scipy.linalg.solve(cl.T, cs.T).T  # B = XA
     except scipy.linalg.LinAlgError:
-        d, t = numpy.linalg.eigh(m)
+        d, t = scipy.linalg.eigh(m)
         idx = d>LINEAR_DEP_THRESHOLD
         t = t[:,idx] / numpy.sqrt(d[idx])
         tht = reduce(numpy.dot, (t.T.conj(), h, t))
-        e, a = numpy.linalg.eigh(tht)
+        e, a = scipy.linalg.eigh(tht)
         a = numpy.dot(t, a)
         idx = e > -c**2
         cl = a[:nao,idx]
         cs = a[nao:,idx]
-        # X = B A^{-1} = B A^T S
-        x = cs.dot(cl.conj().T).dot(m)
+        # X = B A^{-1} = B (A^T A)^{-1} A^T
+        cl_inv = scipy.linalg.solve(cl.conj().T.dot(cl), cl.conj().T)
+        x = cs.dot(cl_inv)
     return x
 
 def _x2c1e_get_hcore(t, v, w, s, c):
@@ -919,11 +893,11 @@ def _x2c1e_get_hcore(t, v, w, s, c):
         # cs = a[nao:,nao:]
         e = e[nao:]
     except scipy.linalg.LinAlgError:
-        d, t = numpy.linalg.eigh(m)
+        d, t = scipy.linalg.eigh(m)
         idx = d>LINEAR_DEP_THRESHOLD
         t = t[:,idx] / numpy.sqrt(d[idx])
         tht = reduce(numpy.dot, (t.T.conj(), h, t))
-        e, a = numpy.linalg.eigh(tht)
+        e, a = scipy.linalg.eigh(tht)
         a = numpy.dot(t, a)
         idx = e > -c**2
         cl = a[:nao,idx]
@@ -959,7 +933,7 @@ def _x2c1e_get_hcore(t, v, w, s, c):
 #      = S A R[A]^{-1}^+ A^+ h1 A R[A]^{-1} A^+ S
 #      = S A R[A]^{-1}^+ e R[A]^{-1} A^+ S                (2)
 
-    w, u = numpy.linalg.eigh(reduce(numpy.dot, (cl.T.conj(), s, cl)))
+    w, u = scipy.linalg.eigh(reduce(numpy.dot, (cl.T.conj(), s, cl)))
     idx = w > 1e-14
     # Adopt (2) here because X is not appeared in Eq (2).
     # R[A] = u w^{1/2} u^+,  so R[A]^{-1} A^+ S in Eq (2) is
@@ -1076,3 +1050,55 @@ def _decontract_spinor(mol, atoms=None):
     pmol._env = numpy.hstack(env)
     contr_coeff = scipy.linalg.block_diag(*contr_coeff)
     return pmol, contr_coeff
+
+def _atoms_in_mole(mol):
+    atoms = {}
+    for i in range(mol.natm):
+        symb = mol.atom_symbol(i)
+        if symb not in atoms:
+            atoms[symb] = atom = mol.copy(deep=False)
+            mask = mol._bas[:,mole.ATOM_OF] == i
+            atom._bas = mol._bas[mask]
+            atom._atm = mol._atm[i:i+1]
+            atom._bas[:,mole.ATOM_OF] = 0
+    return atoms
+
+def _spin_orbital_atomic_1e_x(mol):
+    atoms = _atoms_in_mole(mol)
+    x_conf = {}
+    c = lib.param.LIGHT_SPEED
+    for elem, atom in atoms.items():
+        t1 = _block_diag(atom.intor_symmetric('int1e_kin'))
+        s1 = _block_diag(atom.intor_symmetric('int1e_ovlp'))
+        v1 = _block_diag(atom.intor_symmetric('int1e_nuc'))
+        w1 = _sigma_dot(atom.intor('int1e_spnucsp'))
+        x_conf[elem] = _x2c1e_xmatrix(t1, v1, w1, s1, c)
+
+    atom_slices = mol.offset_nr_by_atom()
+    nao = mol.nao_nr()
+    x = numpy.zeros((2, nao, 2, nao), dtype=numpy.complex128)
+    for ia in range(mol.natm):
+        p0, p1 = atom_slices[ia, 2:]
+        elem = mol.atom_symbol(ia)
+        x[:,p0:p1,:,p0:p1] = x_conf[elem].reshape(2, p1-p0, 2, p1-p0)
+    return x.reshape(nao*2, nao*2)
+
+def _spinor_atomic_1e_x(mol):
+    atoms = _atoms_in_mole(mol)
+    x_conf = {}
+    c = lib.param.LIGHT_SPEED
+    for elem, atom in atoms.items():
+        t1 = atom.intor_symmetric('int1e_kin_spinor')
+        s1 = atom.intor_symmetric('int1e_ovlp_spinor')
+        v1 = atom.intor_symmetric('int1e_nuc_spinor')
+        w1 = atom.intor_symmetric('int1e_spnucsp_spinor')
+        x_conf[elem] = _x2c1e_xmatrix(t1, v1, w1, s1, c)
+
+    atom_slices = mol.offset_2c_by_atom()
+    nao = mol.nao_2c()
+    x = numpy.zeros((nao, nao), dtype=numpy.complex128)
+    for ia in range(mol.natm):
+        p0, p1 = atom_slices[ia, 2:]
+        elem = mol.atom_symbol(ia)
+        x[p0:p1,p0:p1] = x_conf[elem]
+    return x