Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SORCSD2BY1 computes inaccurate result #917

Closed
2 tasks done
christoph-conrads opened this issue Oct 26, 2023 · 7 comments · Fixed by #930
Closed
2 tasks done

SORCSD2BY1 computes inaccurate result #917

christoph-conrads opened this issue Oct 26, 2023 · 7 comments · Fixed by #930

Comments

@christoph-conrads
Copy link
Contributor

christoph-conrads commented Oct 26, 2023

Description

Given a 2-by-1 block matrix Q with orthonormal columns, SORCSD2BY1 computes matrices U1, U2, Σ1, Σ2, and V such that

|Q11|   |U1   | |Σ1|
|   | = |     | |  | V^T
|Q21|   |   U2| |Σ2|

In commit 4174d8d, SORCSD2BY1 computes an inaccurate result with a backward error of ||Q - U Σ V^T|| ≈ 10^-5 when it should be at most ε ||Q|| ≈ 10^-7. Input with m = 3, n = 2, p = 2 (Q11 ∈ ℝ(p, n), Q21 ∈ ℝ(m - p, n)):

    [-2.0392263e-01 -9.7898704e-01]
Q = [ 1.1427624e-08  9.2925374e-09]
    [ 9.7898704e-01 -2.0392257e-01]

Checklist

  • I've included a minimal example to reproduce the issue
  • I'd be willing to make a PR to solve this issue

This issue was found while working on #406.

@christoph-conrads
Copy link
Contributor Author

The Python code below demonstrates the problem. It requires Python 3, NumPy, and a shared LAPACK library.

Call it with python3 lapack-issue-917.py and optionally pass the path to the LAPACK shared library. Example (the first execution uses the system's LAPACK installation, the second execution uses my LAPACK build of the latest master branch code):

$ python3 lapack-issue-917.py 
name of LAPACK library: liblapack.so.3
tolerance: 1.69e-07
actual value: 2.97e-05 (176.0 times tolerance)
input matrix:
[[-2.0392263e-01 -9.7898704e-01]
 [ 1.1427624e-08  9.2925374e-09]
 [ 9.7898704e-01 -2.0392257e-01]]
reassembled matrix:
[[-2.0392254e-01 -9.7898668e-01]
 [ 6.0102630e-06  2.9068287e-05]
 [ 9.7898704e-01 -2.0392255e-01]]
$ python3 lapack-issue-917.py /tmp/tmp.y7q9ZOrOZA/lib/liblapack.so.3
name of LAPACK library: /tmp/tmp.y7q9ZOrOZA/lib/liblapack.so.3
tolerance: 1.69e-07
actual value: 2.97e-05 (176.0 times tolerance)
input matrix:
[[-2.0392263e-01 -9.7898704e-01]
 [ 1.1427624e-08  9.2925374e-09]
 [ 9.7898704e-01 -2.0392257e-01]]
reassembled matrix:
[[-2.0392254e-01 -9.7898668e-01]
 [ 6.0102630e-06  2.9068287e-05]
 [ 9.7898704e-01 -2.0392255e-01]]
#!/usr/bin/python3

import ctypes
import ctypes.util
from ctypes import byref, c_char, c_float, c_int32, c_void_p, POINTER
import sys

import numpy as np
import numpy.linalg as la

def main():
    if len(sys.argv) not in [1, 2]:
        return "usage: python3 %s [path to LAPACK library]" % (sys.argv[0], )

    if len(sys.argv) >= 2:
        lapack_library = sys.argv[1]
    else:
        lapack_library = ctypes.util.find_library("lapack")

    print("name of LAPACK library:", lapack_library)

    lapack = ctypes.CDLL(lapack_library, use_errno=True)

    lapack.sorcsd2by1_.restype = None
    # an alias for the _F_ortran integer type
    f_int = ctypes.c_int32
    lapack.sorcsd2by1_.argtypes = [
        POINTER(c_char),
        POINTER(c_char),
        POINTER(c_char),
        # M
        POINTER(f_int),
        # N
        POINTER(f_int),
        # P
        POINTER(f_int),
        # X11
        POINTER(c_float),
        POINTER(f_int),
        # X21
        POINTER(c_float),
        POINTER(f_int),
        # theta
        POINTER(c_float),
        # U1
        POINTER(c_float),
        POINTER(f_int),
        # U2
        POINTER(c_float),
        POINTER(f_int),
        # V^T
        POINTER(c_float),
        POINTER(f_int),
        # work
        POINTER(c_float),
        POINTER(f_int),
        POINTER(f_int),
        POINTER(f_int),
    ]

    dtype = np.float32
    eps = np.finfo(dtype).eps
    nan = np.float32(np.nan)

    m = 2 + 1
    n = 2
    p = 2
    q = np.array(
        [
            [-0.203922629, -0.978987038],
            [1.14276242E-08, 9.29253741E-09],
            [0.978987038, -0.203922570],
        ],
        dtype=np.float32,
        order="F",
    )
    q_copy = q.copy(order="F")
    u1 = np.full([p, p], nan, order="F")
    u2 = np.full([m - p, m - p], nan, order="F")
    vt = np.full([n, n], nan, order="F")
    theta = np.full(n, nan)
    work = np.full(256, nan)
    iwork = np.full(256, 0, dtype=np.int32 if f_int == c_int32 else np.int64)
    info = f_int(0)

    yes = byref(c_char(b'y'))
    intref = lambda n: byref(f_int(n))
    lapack.sorcsd2by1_(
        yes,
        yes,
        yes,
        intref(m),
        intref(n),
        intref(p),
        q_copy.ctypes.data_as(POINTER(c_float)),
        intref(q_copy.shape[0]),
        q_copy[2].ctypes.data_as(POINTER(c_float)),
        intref(q_copy.shape[0]),
        theta.ctypes.data_as(POINTER(c_float)),
        u1.ctypes.data_as(POINTER(c_float)),
        intref(u1.shape[0]),
        u2.ctypes.data_as(POINTER(c_float)),
        intref(u2.shape[0]),
        vt.ctypes.data_as(POINTER(c_float)),
        intref(vt.shape[0]),
        work.ctypes.data_as(POINTER(c_float)),
        intref(work.shape[0]),
        iwork.ctypes.data_as(POINTER(f_int)),
        byref(info),
    )

    assert info.value == 0

    r = min(p, m - p, n, m - n)
    k1 = max(n + p - m, 0)
    k2 = max(n - p, 0)

    assert r == 1
    assert k1 == 1
    assert k2 == 0

    sigma_1 = np.zeros([2, 2], dtype=np.float32)
    sigma_2 = np.zeros([1, 2], dtype=np.float32)
    sigma_1[0, 0] = 1.0
    sigma_1[1, 1] = np.cos(theta[0])
    sigma_2[0, 1] = np.sin(theta[0])

    reassembled_q11 = np.dot(u1, np.dot(sigma_1, vt))
    reassembled_q21 = np.dot(u2, np.dot(sigma_2, vt))
    reassembled_q = np.vstack([reassembled_q11, reassembled_q21])

    tol = eps * la.norm(q)
    delta = la.norm(reassembled_q - q)

    print("tolerance: %8.2e" % tol)
    print("actual value: %8.2e (%.1f times tolerance)" % (delta, delta / tol))
    print("input matrix:")
    print(q)
    print("reassembled matrix:")
    print(reassembled_q)


if __name__ == "__main__":
    sys.exit(main())

@christoph-conrads
Copy link
Contributor Author

christoph-conrads commented Nov 2, 2023

The patch closing #634 added the assumption that the input vector x of xORBDB6 has unit norm. This may not be the case when xORBDB6 is called for the first time inside xORBDB5 (the only xORBDB6-caller in LAPACK).

[Given a vector x and a matrix with orthonormal columns Q, xORBDB6 returns the components of x orthogonal to the range of Q or any other vector orthogonal to ran(Q) if x lies within ran(Q).]

For example when calling SORCSD2BY1 with the arguments in the first post, the SORBDB6 arguments are as follows:

x = [-7.45058060E-08; -1.51934143E-09]
Q = [0.999999702; -7.29401961E-09]

x does evidently not have unit norm.

@christoph-conrads
Copy link
Contributor Author

christoph-conrads commented Nov 3, 2023

Given the following 2x1 matrix with orthonormal columns
Q1 = [1 - ε / 2; 0]
and the following unit-norm vector x1:

x0 = [6399 / 8192 ε; 1 / 128 ε]
x1 = x0 / ‖x0‖

SORBDB5 computes the output y = [1.2e-5; 1] with ‖Q1^T y‖ ≈ 100 ε. This is clearly inaccurate (the factor 100 should instead be small in modulus).

Call the Python code below as follows (NumPy required):

python3 lapack-issue-917-sorbdb5.py # uses system LAPACK
python3 lapack-issue-917-sorbdb5.py /path/to/lapack.so # uses the LAPACK library at the provided location

Python code calling SORBDB5:

#!/usr/bin/python3

# This file calls SORBDB5 with a unit-norm vector `x` and a matrix `Q` with a
# single unit-norm column as parameters. The vector `x` is overwritten by the
# SORBDB5 output vector `x` which is supposed to be orthogonal to the range of
# `Q`.

import ctypes
import ctypes.util
from ctypes import byref, c_char, c_float, c_int32, c_void_p, POINTER
import sys

import numpy as np
import numpy.linalg as la


def main():
    if len(sys.argv) not in [1, 2]:
        return "usage: python3 %s [path to LAPACK library]" % (sys.argv[0], )

    if len(sys.argv) >= 2:
        lapack_library = sys.argv[1]
    else:
        lapack_library = ctypes.util.find_library("lapack")

    print("name of LAPACK library:", lapack_library)

    lapack = ctypes.CDLL(lapack_library, use_errno=True)

    lapack.sorbdb5_.restype = None
    # an alias for the _F_ortran integer type
    f_int = ctypes.c_int32
    lapack.sorcsd2by1_.argtypes = [
        POINTER(f_int),
        POINTER(f_int),
        POINTER(f_int),
        # X1
        POINTER(c_float),
        POINTER(f_int),
        # X2
        POINTER(c_float),
        POINTER(f_int),
        # Q1
        POINTER(c_float),
        POINTER(f_int),
        # Q2
        POINTER(c_float),
        POINTER(f_int),
        # WORK
        POINTER(c_float),
        POINTER(f_int),
        # INFO
        POINTER(f_int),
    ]

    eps = np.finfo(np.float32).eps
    nan = np.float32(np.nan)

    m1 = 2
    m2 = 0
    n = 1
    x1 = np.array([6399 / 8192 * eps, 1 / 128 * eps], dtype=np.float32)
    x1 = x1 / la.norm(x1)
    x2 = np.array([], dtype=np.float32)
    q1 = np.array(
        [
            [1 - eps / 2.],
            [0],
        ],
        dtype=np.float32,
        order="F",
    ).reshape([2, 1])

    assert np.abs(1 - np.dot(q1.T, q1)) <= eps

    q2 = np.array([[]], dtype=np.float32, order="F").reshape((0, 1))
    work = np.full(256, nan)
    info = f_int(0)

    print("vector x")
    print(x1)
    print("matrix with orthonormal columns Q")
    print(q1)

    intref = lambda n: byref(f_int(n))
    lapack.sorbdb5_(
        intref(m1),
        intref(m2),
        intref(n),
        x1.ctypes.data_as(POINTER(c_float)),
        intref(1),
        x2.ctypes.data_as(POINTER(c_float)),
        intref(1),
        q1.ctypes.data_as(POINTER(c_float)),
        intref(q1.shape[0]),
        q2.ctypes.data_as(POINTER(c_float)),
        intref(max(1, q2.shape[0])), # ldq2 must be nonnegative
        work.ctypes.data_as(POINTER(c_float)),
        intref(work.shape[0]),
        byref(info),
    )

    assert info.value == 0

    x1 = x1 / la.norm(x1)

    tol = 2 * eps
    delta = la.norm(np.dot(q1.T, x1))

    print("tolerance: %.2e" % (tol, ))
    print("computed vector orthogonal to ran(Q):")
    print(x1)
    if delta <= tol:
        outcome = "ok"
    else:
        outcome = "FAIL (error is %.3f times epsilon)\n" % (delta / eps,)

    print()
    print(outcome)

if __name__ == "__main__":
    sys.exit(main())

@christoph-conrads
Copy link
Contributor Author

christoph-conrads commented Nov 7, 2023

@langou, ALPHA is provably too small.

Context

xORCSD2BY1/xUNBDB6 calls xORBDB6/xUNBDB6 indirectly. Given a vector x and a unit-norm vector q, xORBDB6 computes a vector orthogonal to q with at most two iterations of the Gram-Schmidt method. The subroutine terminates immediately if

∥y∥₂ ≥ α ∥x∥₂,

where y is the result of one round of the Gram-Schmidt method

y ≔ x - q (q^* x)

and 0 < α < 1 is a magic constant. The assumption is that α is sufficiently large to ensure that y is numerically orthogonal to q. This is not the case with the current value of α.

Implementation

In the current code α = 0.1 and this value is provably too small. Let x ≔ [1; β] and let q = [1 - ε; ε]. We are looking for the value of β maximizing the inner product of q and y subject to ∥y∥₂ ≥ α ∥x∥₂. With a bit of calculation one arrives at the equations

q^* y = ε (α + β)

and

α² (1 + β²) ≈ β² - 2 ε β

Solving for β one arrives at β ≈ 0.1005 and ∣q^* y∣ ≈ 20.9 ε. y is clearly not numerically orthogonal to q.

Next we can reverse the question and ask for a value of α subject to ∣q^* y∣ ≤ 2 ε or equivalently β ≈ 1. The solution is α ≈ 1 / √2. This is clearly much larger than the current value 0.1.

Note

I do not claim that the values above maximize the required value of α; they were just derived from the input in the issue description and modified to yield nice formula.

Demo Code

The script below allows you to play with the values:

#!/usr/bin/python3

import numpy as np
import numpy.linalg as la

eps = np.finfo(np.float32).eps
alpha = 0.1
beta = 1 / (1 - alpha**2) * eps + alpha / np.sqrt(1 - alpha**2)
# slightly grow beta to avoid failing vector norm test norm(y) / norm(x) below
beta = (1 + eps) * beta

print("beta =", beta)

x = np.array([1, beta], dtype=np.float32)
q = np.array([1 - eps, eps], dtype=np.float32)

assert np.abs(1 - la.norm(q)) <= eps

y = x - np.dot(q, np.dot(q.T, x))

# check that the early termination condition is fulfilled
assert la.norm(y) >= alpha * la.norm(x)

print("norm(y) / norm(x) =", la.norm(y) / la.norm(x))
print("            q^* y =", np.dot(q.T, y) / la.norm(y))

@langou
Copy link
Contributor

langou commented Nov 7, 2023

Thanks @ConradS.

Sorry our posts criss-crossed. I was answering on the other thread at:
#928 (comment)

A similar analysis for two vectors to what you are doing is done by Beresford Parlett in his book "The Symmetric Eigenvalue Problem" published in 1980. Credits for the algorithm is Kahan and Parlett and the algorithm for two vectors gave the moniker of "Twice is enough".

@christoph-conrads
Copy link
Contributor Author

(I am taking the liberty to continue the discussion here.)

Thank you for this hint. I can now see in the Giraud/Langou/Rozložnı́k paper "On the round-off error analysis of the Gram-Schmidt algorithm with
reorthogonalization"
where this magic constant is discussed.

Taking an ALPHA small such as 1/10 should not be too much of an issue in this case. (Only 10x loss of accuracy at most.)

As stated earlier, xORBDB6 assumes unit-norm input vectors x. When this assumption is removed (i.e., xORBDB6 computes the norm of x before performing orthogonalization), then the Python code in the second post still fails with the larger ALPHA = 0.1 value. That is, three layers higher in the call stack the loss of accuracy is detectable by a random test problem.

For this reason I think ALPHA is too small even for practitioners and should be set to a safe value instead or an unconditional second round of orthogonalization should be executed.

Having skimmed the GLR paper, I wonder if ALPHA = 1 / (4 √2) is safe because according to Inequality (2.5) the loss of orthogonality in the Kahan-Parlett method is bounded by

|p · q| ≤ σ ϕ(m, n) ε,

where σ = 1 / ALPHA. In the case of unconditional execution of two orthogonalization rounds, the loss of orthogonality is bounded by (3.21):

∥I - Q^* Q∥₂ ≤ 2 √2n ϕ(m, n) ε

For n = 2, σ = 4 √2 yields the same error bound (according to my understanding the polynomial ϕ(m, n) is the same in both cases).

@langou, please let me know if and how you want me to make the orthogonalization safer.

@christoph-conrads
Copy link
Contributor Author

christoph-conrads commented Nov 9, 2023

The input in the C code below triggers SORCSD2BY1 to return a matrix with infinite values (when the matrix should be orthogonal). The issue is present at least since v3.9.1 (commit 77a0ceb). The cause is xORBDB5 returning a vector very small in norm (much smaller than ε). The causes problems when computing Householder reflectors in SLARFGP in the xORBDB5 caller code. This subroutine does possess a branch for handling underflows but this branch is not taken because ABS(TAU) = 2 * SMLNUM.

IF ( ABS(TAU).LE.SMLNUM ) THEN

The demonstration code (compile with cc -Wextra -Wall -std=c99 -pedantic -llapack) prints the contents of the matrix U1 computed by SORCSD2BY1. This matrix should be orthogonal. On the author's computer and with LAPACK 3.9.1, the output is instead

U1      -inf +1.00e+00
U1      -inf      -inf

Starting with version 3.11, the output is

U1 -8.88e-16 +1.00e+00
U1 -1.00e+00 -8.88e-16

Version 3.11 accidentally fixes the issue. In version 3.11 (specifically in #647) an assumption was added to xORBDB6 that the input vector has unit norm. This is most likely not the case when xORBDB5 calls xORBDB6 for the first time in line 218 but this is the only execution path returning nonzero vectors with a very small norm.

Demonstration code (NB it calls SORCSD2BY1 instad of SORBDB3 because it conveniently forms U1 from the elementary Householder reflectors):

#include <math.h>
#include <stddef.h>
#include <stdio.h>

typedef int lapack_int;

void sorcsd2by1_(
    char* jobu1,
    char* jobu2,
    char* jobv1t,
    lapack_int* m,
    lapack_int* p,
    lapack_int* q,
    float* x11,
    lapack_int* ldx11,
    float* x21,
    lapack_int* ldx21,
    float* theta,
    float* u1,
    lapack_int* ldu1,
    float* u2,
    lapack_int* ldu2,
    float* v1t,
    lapack_int* ldv1t,
    float* work,
    lapack_int* lwork,
    lapack_int* iwork,
    lapack_int* info,
    size_t strlen_jobu1,
    size_t strlen_jobu2,
    size_t strlen_jobv1t);

#define P 2
#define M (P + 1)
#define Q 2

int main()
{
    lapack_int m = M;
    lapack_int p = P;
    lapack_int q = Q;
    float x[M * Q] = { 0.00000000, 1.00000000,
        -1.12831231E-07, -6.37805897E-09, -1.12831231E-07,
        -1.00000000 };
    lapack_int ldx11 = m;
    lapack_int ldx21 = m;
    float nan = NAN;
    float theta[Q] = { nan };
    float u1[P * P] = { nan };
    lapack_int ldu1 = P;
    float u2[(M - P) * (M - P)] = { nan };
    lapack_int ldu2 = M - P;
    float v1t[Q * Q] = { nan };
    lapack_int ldv1t = Q;
    float work[256] = { nan };
    lapack_int lwork = sizeof(work) / sizeof(work[0]);
    lapack_int iwork[M + P + Q] = { -1 };
    lapack_int info = -1;

    //printf("X11 %+9.2e %+9.2e\n", x[0], x[ldx11 + 0]);
    //printf("X11 %+9.2e %+9.2e\n", x[1], x[ldx11 + 1]);
    //printf("X11 %+9.2e %+9.2e\n", x[2], x[ldx11 + 2]);

    char yes = 'Y';
    sorcsd2by1_(
        &yes,
        &yes,
        &yes,
        &m,
        &p,
        &q,
        x,
        &ldx11,
        x + p,
        &ldx21,
        theta,
        u1,
        &ldu1,
        u2,
        &ldu2,
        v1t,
        &ldv1t,
        work,
        &lwork,
        iwork,
        &info,
        1,
        1,
        1);

    printf(
        "U1 %+9.2e %+9.2e\n", u1[0], u1[1 * ldu1 + 0]);
    printf(
        "U1 %+9.2e %+9.2e\n", u1[1], u1[1 * ldu1 + 1]);
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants